Prepare stomach content data v.2

Author

Viktor Thunell & Max Lindmark

Published

July 4, 2025

Introduction

This script prepares stomach content data from three sources: 1) the ICES stomach content database with data from the Baltic Sea, 2) a copy the previous version of the ICES stomach content database and 3) recent Swedish observations not yet uploaded to the ICES database.

The output df contains one row per unique predator, their total and prey gruoup specific relative prey weight (rpw) along with necessary information used in analysis (coorinates, time etc.). To get there, in short, we summarise prey information per predator individual an combine the datasets. Furthermore, we add taxonomy (from the World register of marine species, Worms) and taxonomic rank specific length-weight relationships (using published data and [Fishbase])()), add weight predator or prey information when there is length available and then we do some additional cleaning (see below).

The stomach content database for the Baltic sea was downloaded twice. We use data for Lativa dowloaded at 16:21 Jan 15 2024 and for all other countries, year 1963-2021) was downloaded

Load libraries

pkgs <- c("tidyverse", "tidylog", "devtools", "sdmTMB", "RCurl", "purrr", "janitor", "patchwork", "scales", "mapplots", "worrms", "fuzzyjoin", "rfishbase", "furrr", "broom", "mgcv", "tidygam", "terra", "rnaturalearth","rnaturalearthdata")


if(length(setdiff(pkgs, rownames(installed.packages()))) > 0){
  
  install.packages(setdiff(pkgs, rownames(installed.packages())), dependencies = T)
  
}

invisible(lapply(pkgs, library, character.only = T))

# Source code for map plots
devtools::source_url("https://raw.githubusercontent.com/VThunell/Lammska_cod-fr/main/R/functions/map-plot.R")

options(ggplot2.continuous.colour = "viridis")

# Set path
home <- here::here()

1. Read data

Stomach content data i) current data from ICES stomach database (new data base, NDB) ii) Old database (ODB) data (since many observations are missing from the NDB) iii) Additional newer data (SE)

Additional information i) BITS haul information from DATRAS ii) length-weight coefficients from publication

# New database data, NDB are divided into four sheets
# New data from January 2024 for Latvia: StomachContent_0115213707
# New data (October 2024) that contains less stomachs for Other countries: StomachContent_1021080172

fi <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/File_information.csv")) |> filter(Country == "LV") |>
  bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/File_information.csv")) |> filter(!Country == "LV"))
Rows: 71 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Country, CruiseID
dbl (2): tblUploadID, Reporting_organisation

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 9 rows (13%), 62 rows remaining
Rows: 12 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Country, CruiseID
dbl (2): tblUploadID, Reporting_organisation

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: no rows removed
# For the other three sheets for the NDB we use the tblUploadID for excluding and including LV.
hi <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/HaulInformation.csv"),
               col_types = list(
                 tblUploadID = col_double(),
                 tblHaulID = col_double(),
                 Ship = col_character(),
                 Gear = col_character(),
                 HaulNo = col_double(),
                 StationNumber = col_double(),
                 Year = col_double(),
                 Month = col_double(),
                 Day = col_double(),
                 Time = col_character(),
                 ShootLat = col_double(),
                 ShootLong = col_double(),
                 HaulLat = col_double(),
                 HaulLong = col_double(),
                 ICESrectangle = col_character(),
                 Depth = col_double(),
                 Survey = col_character(),
                 ICESDatabase = col_character(),
                 Notes =  col_character())) |> # the parsing issues of hi causes no problems but specifying col_types corrects the datatypes 
  filter(tblUploadID %in% (fi |> filter(Country == "LV") |> pull(tblUploadID))) |>
  bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/HaulInformation.csv"),
               col_types = list(
                 tblUploadID = col_double(),
                 tblHaulID = col_double(),
                 Ship = col_character(),
                 Gear = col_character(),
                 HaulNo = col_double(),
                 StationNumber = col_double(),
                 Year = col_double(),
                 Month = col_double(),
                 Day = col_double(),
                 Time = col_character(),
                 ShootLat = col_double(),
                 ShootLong = col_double(),
                 HaulLat = col_double(),
                 HaulLong = col_double(),
                 ICESrectangle = col_character(),
                 Depth = col_double(),
                 Survey = col_character(),
                 ICESDatabase = col_character(),
                 Notes =  col_character())) |>
              filter(tblUploadID %in% (fi |> filter(!Country == "LV") |> pull(tblUploadID)))) 
filter: removed 12 rows (16%), 62 rows remaining
filter: removed 203 rows (13%), 1,363 rows remaining
filter: removed 62 rows (84%), 12 rows remaining
filter: no rows removed
pred <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/PredatorInformation_edit.csv")) |> # edited in text editor (not excel or equivalent) to remove three instances of erroneous '"' in column 'Notes' at row 7139,7144 and 7296. 
  filter(tblUploadID %in% (fi |> filter(Country == "LV") |> pull(tblUploadID))) |>
  bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/PredatorInformation.csv")) |> filter(tblUploadID %in% (fi |> filter(!Country == "LV") |> pull(tblUploadID))))
Rows: 36980 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): Ship, Gear, Time, Code, Sex, MaturityScale, PreservationMethod, Ge...
dbl (18): tblUploadID, tblHaulID, tblPredatorInformationID, HaulNo, StationN...
lgl  (3): StomachFullness, FullStomWgt, EmptyStomWgt

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 12 rows (16%), 62 rows remaining
filter: removed 3,504 rows (9%), 33,476 rows remaining
Rows: 5801 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): Ship, Gear, Time, Code, Sex, MaturityScale, PreservationMethod, Ge...
dbl (18): tblUploadID, tblHaulID, tblPredatorInformationID, HaulNo, StationN...
lgl  (3): StomachFullness, FullStomWgt, EmptyStomWgt

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 62 rows (84%), 12 rows remaining
filter: no rows removed
prey <- read_csv(paste0(home, "/data/stomach/StomachContent_0115213707/PreyInformation.csv")) |>
  filter(tblUploadID %in% (fi |> filter(Country == "LV") |> pull(tblUploadID))) |>
  bind_rows(read_csv(paste0(home, "/data/stomach/StomachContent_1021080172/PreyInformation.csv")) |> filter(tblUploadID %in% (fi |> filter(!Country == "LV") |> pull(tblUploadID))))
Rows: 50170 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): Ship, Gear, Time, IdentMet, GravMethod, UnitWgt, UnitLngt, OtherIt...
dbl (21): tblUploadID, tblHaulID, tblPredatorInformationID, tblPreyInformati...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 12 rows (16%), 62 rows remaining
filter: removed 8,326 rows (17%), 41,844 rows remaining
Rows: 12391 Columns: 30
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (9): Ship, Gear, Time, IdentMet, GravMethod, UnitWgt, UnitLngt, OtherIt...
dbl (21): tblUploadID, tblHaulID, tblPredatorInformationID, tblPreyInformati...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
filter: removed 62 rows (84%), 12 rows remaining
filter: no rows removed
# Old database data (ODB)
ODB <- read_csv(paste0(home, "/data/stomach/StomachDataOld.csv")) # from Stefan via Nis via Max
Rows: 574684 Columns: 52
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (16): Dataset, Country, Ship, Estimated_Lat_Lon, ICES_StatRec, Month, Da...
dbl (21): Latitude, Longitude, Year, Depth, Temperature, SampleNo(FishID), I...
lgl (15): RecordType, ICES_AreaCode, Predator_Age(mean), Predator_LowerLengt...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# New SE data
SE <- read_csv(paste0(home, "/data/stomach/full_stomach_data_x.csv")) # From Max
Rows: 92868 Columns: 38
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (12): prey_latin_name, predator_code, pred_id, prey_number_type, prey_w...
dbl  (25): year, month, pred_weight_g, pred_length_cm, gall_bladder, prey_we...
date  (1): date

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# BITS survey data to improve coordinates in ODB
# bits_hh <- getDATRAS(record = "HH", survey = "BITS", years = 1963:2022, quarters = c(1,2,3,4))
# write_csv(bits_hh, paste0(home, "/data/DATRAS_exchange/bits_hh.csv"))
bits_hh <- read_csv(paste0(home, "/data/survey/DATRAS_exchange/bits_hh.csv"))
Rows: 18105 Columns: 69
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (13): RecordType, Survey, Country, Ship, Gear, GearEx, StNo, DayNight, S...
dbl (48): Quarter, SweepLngt, HaulNo, Year, Month, Day, TimeShot, DepthStrat...
lgl  (8): DoorType, Rigging, Tickler, SecchiDepth, Turbidity, TidePhase, Tid...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# North sea length-weight coefficients
# a and bs for benthic species in the North sea: https://doi.org/10.1017/S0025315409991408
NSab <- read_csv(paste0(home, "/data/stomach/length-weight-Robinson_2010.csv"))
New names:
Rows: 216 Columns: 13
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(5): Species, Phylum, Class, Order, Length dbl (7): Min L (mm), Max L (mm), a,
b, r2, P, N lgl (1): ...13
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `` -> `...13`

2. Prey taxonomic summary and length-weight coefficients for all cod stomach prey

From all unique prey AphiaIDs and Scientific names in the different databases we create a full taxonomic list of Baltic cod prey species using the r package worrms. We then use this to add length weight parameters where possible (a & b from literature, fishbase or estimated), we can then join in accepted Scientific names or AphiaIDs to all prey species in the dbs.

Some AphiaIDs turns out to be wrong though…

# Where we have both AphiaID and Scientificname, we test whether these match in worms using function to get name from aphia ID. When they dont, we that the AID is wrong because the AID should come after the name in the laboratory identification-enter-in-db-process and therefore less likley to be wrong.

# Function for getting AphiaIDs from Scientific names
# getNamefAid <- function(Prey_LatinName, Prey_AphiaID, ...)  {
#
#    ScientificName = tryCatch(wm_id2name(id = Prey_AphiaID), error = function(e){
#           message("Scientific name missing, returning NA:\n", e)
#      NA
#         }
#    )
#    return(tibble(ScientificName, Prey_LatinName, Prey_AphiaID)) # preserve ScientificNames for matching in db later
#     }
#
# ODBcheck <- ODB |>  filter(Predator_LatinName == "Gadus morhua",
#                            Longitude > 13) |>
#   dplyr::select(Prey_LatinName, Prey_AphiaID) |>
#   distinct() |>
#   pmap(possibly(getNamefAid, otherwise = NULL))  |>
#   list_rbind()
#
# ODBcheck |>
#   filter(Prey_LatinName != ScientificName)

# For these, we skip the AID and go for Names instead. So if both Prey_LatinName and Prey_AphaID is present, we use name for getting records and LW as and bs.

# An alternative is to get names for all AID:s in the ODB (more than 300 000) and then remove the data where names doesn't match. But, many AphiaIDs are at the rank of e.g. genus och family of a species and is still sort of correct.

Get taxonomy

# # pick columns with species related info all datasets
# str(prey) # only AphiaIDPrey
# str(ODB) # Prey_AphiaID and Prey_LatinName
# str(SE) # prey_latin_name
# 
# # Remove AphiaIDs in the ODB where both scientific name and AID exists (see above). By doing this, we remove instances where the db_ScientificName is way off the correct AID.
# ODB_b <- ODB|>
#   mutate(Prey_AphiaID = if_else(!(is.na(Prey_LatinName) & is.na(Prey_AphiaID)), NA, Prey_AphiaID))
# 
# # Collect all AphiaIDs and latin names and clean names
# AIDLnames <- prey |>
#   filter(AphiaIDPredator == 126436) |>
#   dplyr::select(AphiaIDPrey) |>
#   mutate(db_ScientificName = NA) |>
#   rename(db_AphiaID = AphiaIDPrey) |>
#   bind_rows(SE |>
#               filter(predator_code == "COD") |>
#               dplyr::select(prey_latin_name) |>
#               rename(db_ScientificName = prey_latin_name) |>
#               mutate(db_AphiaID = NA)) |>
#   bind_rows(ODB_b |>
#               filter(Predator_LatinName == "Gadus morhua") |>
#               dplyr::select(Prey_LatinName, Prey_AphiaID) |>
#               rename(db_ScientificName = Prey_LatinName,
#                      db_AphiaID = Prey_AphiaID)) |>
#   distinct() |>
#   filter(!(is.na(db_ScientificName) & is.na(db_AphiaID)))
# 
# # We also use Stomach_Item in the ODB where there is prey length or weight but latin name is missing. Add those names to the list
# AIDLnames <- AIDLnames |>
#   bind_rows(ODB_b |> filter(is.na(Prey_LatinName), Prey_Length > 0 | !is.na(Prey_Length | Prey_Weight > 0)) |> distinct(Stomach_Item) |> rename(db_ScientificName = Stomach_Item))
# 
# AIDLnamesClean <- AIDLnames |>
#   mutate(db_CleanScientificName = make_clean_namess(db_ScientificName, allow_dupes = TRUE), # some names cleaning her and below
#          db_CleanScientificName = str_to_title(db_CleanScientificName),
#          db_CleanScientificName = str_remove_all(db_CleanScientificName,"//."),
#          db_CleanScientificName = str_remove(db_CleanScientificName,"2"),
#          db_CleanScientificName = str_remove_all(db_CleanScientificName," "),
#          db_CleanScientificName = str_remove_all(db_CleanScientificName,"  "),
#          db_CleanScientificName = str_replace_all(db_CleanScientificName,"_", " "),
#          db_CleanScientificName = if_else(str_ends(db_CleanScientificName, "sp"), str_remove(db_CleanScientificName,"sp"), db_CleanScientificName),
#          db_CleanScientificName = str_replace_all(db_CleanScientificName,"_", " ")) |>
#   mutate(db_CleanScientificName = if_else(db_CleanScientificName == "Na", NA, db_CleanScientificName))
# 
# # Function for getting AphiaIDs from Scientific names
# getAidfName <- function(db_ScientificName, db_CleanScientificName, ...)  {
# 
#    db_AphiaID = tryCatch(wm_name2id(name = db_CleanScientificName), error = function(e){
#           message("Scientific name missing, returning NA:\n", e)
#      NA
#         }
#    )
#    return(tibble(db_AphiaID, db_ScientificName, db_CleanScientificName)) # preserve db_ScientificNames for matching in db later
#     }
# 
# # apply function to ScientificName as AID:s seems to produce more matching records than names. Noe that many generates error: partial content. We need to fix those manually later.
# AIDLnamesClean2 <- AIDLnamesClean |>
#   filter(is.na(db_AphiaID)) |>
#   pmap(getAidfName) |>
#   list_rbind()
# 
# AIDLnamesClean3 <- AIDLnamesClean |>
#   filter(!is.na(db_AphiaID)) |>
#   bind_rows(AIDLnamesClean2)
# 
# AIDLnamesClean3 <- AIDLnamesClean3 |>
#   distinct() # 51 rows were duplicates, i.e. AID matched with names.
# # now we have a df with all AphiaIDs and Scientific names found in the databases with matching records. From this we can add to the respective database the valid AphiaID for each prey.
# # get records for the prey species list based on AphiaIDs (but remove NAs)
# getRecordsfAid <- function(db_AphiaID, db_ScientificName, db_CleanScientificName, ...)  {
#   if(is.numeric(db_AphiaID)) { # if db_AphiaID is NA, this is false
#       wm_record(id = db_AphiaID) |> tibble(db_AphiaID, db_ScientificName, db_CleanScientificName) }
#   else {
#     wm_records_common(name = db_CleanScientificName, fuzzy = TRUE) |> tibble(db_AphiaID, db_ScientificName, db_CleanScientificName) }
# }
# 
# # get records for the prey species list based on AphiaIDs (but remove NAs)
# preySpecies_records <- AIDLnamesClean3 |>
#   pmap(possibly(getRecordsfAid, otherwise = NULL))  |>
#   list_rbind()
# 
# # preySpecies_records lacks records of 48 db_ScientificNames (where db_AphiaID == NA) so lets try and fix the missing records with a fuzzy join of latin names
# preySpecies_records2 <- AIDLnamesClean3 |>
#   filter(is.na(db_AphiaID),
#          !is.na(db_CleanScientificName)) |> # one row with all columns NA
#   stringdist_left_join(preySpecies_records |> filter(!(is.na(AphiaID) | is.na(valid_name))) |>
#                          dplyr::select(-db_AphiaID, -db_ScientificName, -db_CleanScientificName),
#                        by = c(db_CleanScientificName = "valid_name")) |>
#   distinct()
# 
# # 48 records generated 53 which is good and we can add the non-NAs record to the list but we need to keep the db_Scientificnames so that these can be used to match records later. Theres a few dupes to remove
# preySpecies_records2 |>
#   rownames_to_column() |>
#   group_by(db_ScientificName) %>%
#   filter(n()>1)
# # and there are a few fuzzy joins that didnt end well, Scales are not Fucales, Mucus is not Fucus and Munida is not Mysida.
# preySpecies_records2 |>
#   filter(db_ScientificName != scientificname) |>
#   dplyr::select(db_ScientificName, scientificname)
# 
# # after checking we can remove [c(1,13,29,34,53), ] and then fuzzy-errors and then the missing record rows
# preySpecies_records3 <- preySpecies_records2 |>
#   slice(-c(1,13,29,34,53)) |>
#   filter(!db_ScientificName %in% c("Scales", "scales", "mucus", "Munida"),
#          !is.na(AphiaID))
# # reset "Scales", "scales", "mucus", "Munida" AphiaID to NA
# preySpecies_records2 <- preySpecies_records2 |>
#   mutate(AphiaID = if_else(db_ScientificName %in% c("Scales", "scales", "mucus", "Munida"), NA, AphiaID))
# 
# # for the remaining non-id:d and, we correct the organic animal prey (reomev plants and trash) manually and the others are NA.
# preySpecies_records2 |> slice(-c(1,13,29,34,53)) |> filter(is.na(AphiaID)) |> pull(db_ScientificName)
# 
# preySpecies_records4 <- preySpecies_records2 |> slice(-c(1,13,29,34,53)) |> filter(is.na(db_AphiaID)) |>
#   mutate(db_AphiaID = case_when(db_ScientificName %in% c("Pisces/Osteichthyes","Scales","scales", "Unidentified fish","Fish eggs","Pomatoschistus otholyth","Salmon stomach") ~ 152352,
#                                 db_ScientificName %in% c("Stone","Waste","Wood","Algae","stone","wood", "Aglae", "plastic", "carbon","Carbon","Plastic","Plastics","Litter/plastics", "", "Sand","Unidentified algae covered wit","Unidentified mass","Undefined mass") ~ NA,
#                                 db_ScientificName %in% c("remains","mucus","Remains","digestive tract","Unidentified remains","Spawn","Siphon","Entrails","Chicken bone","Zooplancton","Unidentified invertebrata","Unidentified worm") ~ 2,
#                                 db_ScientificName %in% c("Mytilus sp.","Mytilus sp") ~ 138228,
#                                 db_ScientificName == "Idotea sp." ~ 118454,
#                                 db_ScientificName == "Phaeophycophyta" ~ 830,
#                                 db_ScientificName == "Crangon" ~ 107007,
#                                 db_ScientificName == "Pectinaria sp." ~ 129437,
#                                 db_ScientificName == "Gobius niger" ~ 126892,
#                                 db_ScientificName == "Pleuronectoidei" ~ 125579,
#                                 db_ScientificName == "Corystes cassivelanus" ~ 107277,
#                                 db_ScientificName == "Sipunculoidea" ~ 1648,
#                                 db_ScientificName == "Cribrinidae" ~ 267346,
#                                 db_ScientificName == "Spirontocaris securifrons" ~ 107531,
#                                 db_ScientificName == "Spirontocaris affinis" ~ 106994,
#                                 db_ScientificName == "Ciliata mustella" ~ 126448,
#                                 db_ScientificName == "Ophiothricidae" ~ 123208,
#                                 db_ScientificName == "Leptocardia" ~ 104897,
#                                 db_ScientificName == "Brachyrhyncha" ~ 736748,
#                                 db_ScientificName == "Astarte procera" ~ 1735368,
#                                 db_ScientificName == "Paleonemertea" ~ 122307,
#                                 db_ScientificName == "Gadoidei" ~ 125469,
#                                 db_ScientificName %in% c("Clupeoidei","Clupeidae scales") ~ 125464,
#                                 db_ScientificName == "Gammarus sp."  ~  101537,
#                                 db_ScientificName == "Metridiidae" ~ 100678,
#                                 db_ScientificName == "Lampetra" ~ 101167,
#                                 db_ScientificName == "Hyperia" ~ 101796,
#                                 db_ScientificName == "Dromiacea" ~ 106678,
#                                 db_ScientificName == "Lucifer" ~ 106827,
#                                 db_ScientificName %in% c("Galathea sp.","Galathea") ~ 106834,
#                                 db_ScientificName == "Munida" ~ 106835,
#                                 db_ScientificName == "Porcellana" ~ 106838,
#                                 db_ScientificName == "Nephrops" ~ 106863 ,
#                                 db_ScientificName == "Atelecyclus rotundatus" ~ 107273,
#                                 db_ScientificName == "Asterias" ~ 123219,
#                                 db_ScientificName == "Cucumaria" ~ 123479,
#                                 db_ScientificName == "Gadus" ~ 125732,
#                                 db_ScientificName == "Ciliata" ~ 125741,
#                                 db_ScientificName %in% c("Trachurus","Horse mackerel") ~ 125946,
#                                 db_ScientificName == "Trachinus" ~ 126094,
#                                 db_ScientificName == "Owenia" ~ 129427,
#                                 db_ScientificName == "Solea" ~ 126132,
#                                 db_ScientificName == "Pectinaria" ~ 129437,
#                                 db_ScientificName == "Synchaeta" ~ 134958,
#                                 db_ScientificName == "Turritella" ~ 138615,
#                                 db_ScientificName == "Polydora" ~ 129619 ,
#                                 db_ScientificName == "Macropipus dupurator" ~ 156228,
#                                 db_ScientificName == "Gnathostomata" ~ 1828,
#                                 db_ScientificName == "Cardium edule" ~ 1600680,
#                                 db_ScientificName == "Pontophilus gracilis" ~ 246192,
#                                 db_ScientificName == "Pectinaria californiensis" ~ 337505,
#                                 db_ScientificName == "Nereis diversicolor" ~ 152302,
#                                 db_ScientificName == "Chlamys" ~ 138315,
#                                 db_ScientificName == "Asterias rubens" ~ 123776,
#                                 db_ScientificName == "Littorina planaxis" ~ 445651,
#                                 db_ScientificName == "Liparis" ~ 126160,
#                                 db_ScientificName == "Ophiothrix" ~ 123626,
#                                 db_ScientificName == "Anguilla vulgaris" ~ 126281,
#                                 db_ScientificName == "Phyllodoce maculata" ~ 334510,
#                                 db_ScientificName == "Palaemon sp." ~ 107032,
#                                 db_ScientificName == "Pectinaria sp." ~ 129437,
#                                 db_ScientificName == "Astropecten duplicatus" ~ 178650,
#                                 db_ScientificName == "Amphitrite" ~ 129686,
#                                 db_ScientificName == "ASTARTE" ~ 137683,
#                                 db_ScientificName == "Pecten raveneli" ~ 394070,
#                                 db_ScientificName == "Phyllodoce" ~ 129455,
#                                 db_ScientificName == "Terebellidae" ~ 982,
#                                 db_ScientificName == "Herring" ~ 126417,
#                                 db_ScientificName %in% c("Sprat","Sprat eggs","sprat") ~ 126425,
#                                 db_ScientificName %in% c("Cod","Cod Eggs","Cod stomach","Filet of Cod") ~ 126436,
#                                 db_ScientificName == "Four-bearded rockling" ~ 126450,
#                                 db_ScientificName == "Flounder" ~ 125579,
#                                 db_ScientificName == "Whiting" ~ 126438,
#                                 db_ScientificName == "Stickleback" ~ 1617137,
#                                 db_ScientificName == "Nine-spined stickleback" ~ 126507,
#                                 db_ScientificName %in% c("Liocarcinus sp.","liocarcinus sp.") ~ 106925,
#                                 db_ScientificName == "Pagurus sp." ~ 106854,
#                                 db_ScientificName == "Fifteen-spined stickleback" ~ 125777,
#                                 db_ScientificName == "Pasiphaea sp." ~ 107052,
#                                 db_ScientificName == "Zoarces viviparus eggs" ~ 127123,
#                                 db_ScientificName == "Myoxocephalus quadricornis egg" ~ 254529,
#                                 db_ScientificName == "Round goby" ~ 126916,
#                                 db_ScientificName == "Enchelyopus eggs" ~ 125742,
#                                 db_ScientificName == "Lycodes sp." ~ 126104,
#                                 db_ScientificName == "Corophium sp." ~ 101489,
#                                 db_ScientificName == "Cerastoderma sp." ~ 137735,
#                                 db_ScientificName == "Broad-nosed pipefish" ~ 127393,
#                                 db_ScientificName == "Saduria entomon eggs" ~ 119034,
#                                 db_ScientificName == "Macoma sp." ~ 138531,
#                                 db_ScientificName == "Taurulus bubalis eggs" ~ 127204,
#          .default = NA),
#          db_AphiaID = as.integer(db_AphiaID))
# 
# # get records for the fixed AIDs, reuse preySpecies as name for the df to use getRecordsAid() function
# preySpecies_records5 <- preySpecies_records4 |>
#   dplyr::select(db_AphiaID, db_ScientificName, db_CleanScientificName) |>
#   pmap(possibly(getRecordsfAid, otherwise = NULL))  |>
#       list_rbind()
# 
# leftover_prey <- preySpecies_records4 |> filter(is.na(db_AphiaID) & is.na(AphiaID))
# # bind preySpeciesrecords with preySpeciesrecords3 and preySpeciesrecords5
# 
# preySpecies_records6 <- preySpecies_records |>
#   bind_rows(preySpecies_records3, preySpecies_records5, leftover_prey)
# 
# preySpecies_records6 |>
#   filter(db_CleanScientificName != scientificname) |>
#   group_by(db_ScientificName) |>
#   filter(n()>1)
# 
# # remove the duplicates and give it a final name
# preyTaxamatch <- preySpecies_records6 |>
#   filter(!(is.na(db_AphiaID) & db_ScientificName %in% c("Gammarus sp.", "Palaemon sp.")))
# 
# saveRDS(preyTaxamatch, paste0(home, "/data/stomach/preyTaxamatch.rds"))
preyTaxamatch <- readRDS(paste0(home, "/data/stomach/preyTaxamatch.rds"))

Length-weight parameters (a and b) to prey summary

length-weight relationships retrived from and Fishbase using rfishbase.

# # Using NSab where coefficients are for length in mm while fishbase is for cm. We´ll correct for this below
# preyTaxamatch_Invert <- preyTaxamatch |>
#   filter(!phylum == "Chordata",
#          !is.na(scientificname)
#          ) |>
#   stringdist_left_join(NSab |> dplyr::select("Species", "a", "b"), by = c(scientificname = "Species")) |>
#   mutate(abTaxlevel = if_else(is.na(a), NA, "species"))
# 
# # get mean a and b per order
# NSab_order <- NSab |> summarise(a = mean(a),  b = mean(b),
#                   .by = c(Order))
# 
# preyTaxamatch_Invert2 <- preyTaxamatch_Invert |>
#   filter(is.na(a)) |> dplyr::select(-a,-b) |>
#   left_join(NSab_order |> dplyr::select("Order", "a", "b"), join_by("order"=="Order")) |>
#   mutate(abTaxlevel = if_else(is.na(a), NA, "order")) |>
#   bind_rows(preyTaxamatch_Invert |> filter(!is.na(a)))
# 
# # still more than half of a and bs missing...
# preyTaxamatch_Invert2 |> filter(is.na(a))
# 
# # use class instead of order
# NSab_class <- NSab |> summarise(a = mean(a),  b = mean(b),
#                   .by = c(Class))
# 
# preyTaxamatch_Invert3 <- preyTaxamatch_Invert2 |>
#   filter(is.na(a)) |> dplyr::select(-a,-b) |>
#   left_join(NSab_class |> dplyr::select("Class", "a", "b"), join_by("class"=="Class")) |>
#    mutate(abTaxlevel = if_else(is.na(a), NA, "class")) |>
#   bind_rows(preyTaxamatch_Invert2 |> filter(!is.na(a)))
# 
# # still lots (122) missing, but many classes are maybe not length-weight realtionship applicable, such as malcostraca, bivalvia and some other molluscs?
# preyTaxamatch_Invert3 |> filter(is.na(a))
# 
# # use phylum instead of class
# NSab_phylum <- NSab |> summarise(a = mean(a),  b = mean(b),
#                   .by = c(Phylum))
# 
# preyTaxamatch_Invert4 <- preyTaxamatch_Invert3 |>
#   filter(is.na(a)) |> dplyr::select(-a,-b) |>
#   left_join(NSab_phylum |> dplyr::select("Phylum", "a", "b"), join_by("phylum"=="Phylum")) |>
#   mutate(abTaxlevel = if_else(is.na(a), NA, "phylum")) |>
#   bind_rows(preyTaxamatch_Invert3 |> filter(!is.na(a)))
# 
# # still some missing
# preyTaxamatch_Invert4 |> filter(is.na(a))
# # Ok, thats enough, ever tried to messure a Priapulid?
# 
# # The NSabs are fitted for animals meassured in mm which they are not for fishbase (FSab),
# preyTaxamatch_Invert4 <- preyTaxamatch_Invert4 |>
#   mutate(a = a*(1/10)^b) # rescale since a and b are estimated in mm following the rule (ab)^c = a^c*b^c a so that we can break out 1/10 (mm to cm) and add it to the a scalar
# 
# # Now Chordates (i.e. fish) by using package rfishbase
# #length_weight("Gadus morhua")
# #as the rfishbase function length_weight returns multimple observations, we have to make a function # that both gets and summarises those
# getabsChord <- function(nam, ind)  {
# 
#   lwab = length_weight(nam) |>
#     filter(Type == "TL") |> # filter out total length measurements
#     summarise(a = mean(a), b = mean(b))
# 
#   bind_cols(preyTaxamatch_Chord[ind,], lwab)
# }
# 
# # then we can map that function with a furrr::future to improve speed as length_weight takes a while to run
# preyTaxamatch_Chord <- preyTaxamatch |> filter(phylum == "Chordata")
# 
# plan(multisession, workers = 3) # saves 40% time in a small test
# preyTaxamatch_Chord <- preyTaxamatch_Chord |>
#   pull(valid_name) |>
#  future_imap(\(x, idx) getabsChord(x, idx), .options = furrr_options(seed = T)) |>
#  list_rbind()
# plan(sequential)
# 
# preyTaxamatch_Chord <- preyTaxamatch_Chord |>
#  mutate(abTaxlevel = if_else(is.na(a), NA, "species"))
# 
# # lots of as and bs missing
# preyTaxamatch_Chord |> filter(is.na(a))
# 
# # use family instead of genus
# FBab_family <- preyTaxamatch_Chord |>
#   filter(!is.na(a)) |>
#   summarise(a = mean(a),  b = mean(b), .by = c(family))
# 
# preyTaxamatch_Chord2 <- preyTaxamatch_Chord |>
#   filter(is.na(a)) |>
#   dplyr::select(-a,-b) |>
#   left_join(FBab_family |> dplyr::select("family", "a", "b")) |>
#   mutate(abTaxlevel = if_else(is.na(a), NA, "family")) |>
#   bind_rows(preyTaxamatch_Chord |> filter(!is.na(a)))
# 
# # left over fish and tunicates
# preyTaxamatch_Chord2 |> filter(is.na(a))
# 
# # lets use mean a and b for the chordates for these
# cho_meanab <- preyTaxamatch_Chord2 |>
#   summarise(mean_a = mean(a, na.rm=TRUE), mean_b = mean(b, na.rm=TRUE))
# 
# preyTaxamatch_Chord2 <- preyTaxamatch_Chord2 |>
#   mutate(a = if_else(is.na(a), cho_meanab$mean_a, a),
#          b = if_else(is.na(b), cho_meanab$mean_b, b))
# 
# # Join in join in invertebrates
# preyTaxamatch_LW <- preyTaxamatch_Chord2 |>
#   bind_rows(preyTaxamatch_Invert4 |> dplyr::select(-Species))
# 
# preyTaxamatch_LW |>
#   summarise(meana = mean(a, na.rm = TRUE), by = abTaxlevel)
# 
# saveRDS(preyTaxamatch_LW, paste0(home, "/data/stomach/preyTaxamatch_LW.rds"))
preyTaxamatch_LW <- readRDS(paste0(home, "/data/stomach/preyTaxamatch_LW.rds"))

preyTaxamatch_LW |>
  filter(is.na(a))
filter: removed 568 rows (94%), 36 rows remaining
# A tibble: 36 × 33
   AphiaID url        scientificname authority status unacceptreason taxonRankID
     <int> <chr>      <chr>          <chr>     <chr>  <chr>                <int>
 1  101156 https://w… Halicryptus s… von Sieb… accep… <NA>                   220
 2  101160 https://w… Priapulus cau… Lamarck,… accep… <NA>                   220
 3  144129 https://w… Fucus          Linnaeus… accep… <NA>                   180
 4  101063 https://w… Priapulida     Théel, 1… accep… <NA>                    30
 5     799 https://w… Nematoda       <NA>      accep… <NA>                    30
 6    1248 https://w… Ctenophora     Eschscho… accep… <NA>                    30
 7  101156 https://w… Halicryptus s… von Sieb… accep… <NA>                   220
 8  101093 https://w… Halicryptus    von Sieb… accep… <NA>                   180
 9  101160 https://w… Priapulus cau… Lamarck,… accep… <NA>                   220
10  101156 https://w… Halicryptus s… von Sieb… accep… <NA>                   220
# ℹ 26 more rows
# ℹ 26 more variables: rank <chr>, valid_AphiaID <int>, valid_name <chr>,
#   valid_authority <chr>, parentNameUsageID <int>, kingdom <chr>,
#   phylum <chr>, class <chr>, order <chr>, family <chr>, genus <chr>,
#   citation <chr>, lsid <chr>, isMarine <int>, isBrackish <int>,
#   isFreshwater <int>, isTerrestrial <int>, isExtinct <int>, match_type <chr>,
#   modified <chr>, db_AphiaID <dbl>, db_ScientificName <chr>, …
preyTaxamatch_LW |>
  filter(!is.na(a)) |>
  pivot_longer(cols = c("a", "b"), values_to = "value", names_to = "params") |>
  ggplot() +
  geom_histogram(aes(value, fill = abTaxlevel)) + #, bins = 100) +
  facet_grid(params~abTaxlevel) +
  guides(fill="none")
filter: removed 36 rows (6%), 568 rows remaining
pivot_longer: reorganized (a, b) into (params, value) [was 568x33, now 1136x33]
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# two groupings of a and bs in the histograms above are due to phylum I think, fish have small a and invertebrates have a:s around 23
preyTaxamatch_LW |>
  filter(phylum %in% c("Chordata", "Arthropoda")) |>
  summarise(mean_a = mean(a, na.rm = TRUE), mean_b = mean(b, na.rm = TRUE), .by = phylum)
filter: removed 195 rows (32%), 409 rows remaining
summarise: now 2 rows and 3 columns, ungrouped
# A tibble: 2 × 3
  phylum      mean_a mean_b
  <chr>        <dbl>  <dbl>
1 Chordata   0.00947   3.03
2 Arthropoda 0.0541    2.71

3. New database data

names(fi)
[1] "tblUploadID"            "Country"                "Reporting_organisation"
[4] "CruiseID"              
names(hi)
 [1] "tblUploadID"   "tblHaulID"     "Ship"          "Gear"         
 [5] "HaulNo"        "StationNumber" "Year"          "Month"        
 [9] "Day"           "Time"          "ShootLat"      "ShootLong"    
[13] "HaulLat"       "HaulLong"      "ICESrectangle" "Depth"        
[17] "Survey"        "ICESDatabase"  "Notes"        
names(pred)
 [1] "tblUploadID"              "tblHaulID"               
 [3] "tblPredatorInformationID" "Ship"                    
 [5] "Gear"                     "HaulNo"                  
 [7] "StationNumber"            "Year"                    
 [9] "Month"                    "Day"                     
[11] "Time"                     "FishID"                  
[13] "AphiaIDPredator"          "IndWgt"                  
[15] "Number"                   "MeasurementIncrement"    
[17] "Length"                   "Code"                    
[19] "Age"                      "Sex"                     
[21] "MaturityScale"            "MaturityStage"           
[23] "PreservationMethod"       "Regurgitated"            
[25] "StomachFullness"          "FullStomWgt"             
[27] "EmptyStomWgt"             "StomachEmpty"            
[29] "GenSamp"                  "Notes"                   
names(prey)
 [1] "tblUploadID"              "tblHaulID"               
 [3] "tblPredatorInformationID" "tblPreyInformationID"    
 [5] "Ship"                     "Gear"                    
 [7] "HaulNo"                   "StationNumber"           
 [9] "Year"                     "Month"                   
[11] "Day"                      "Time"                    
[13] "FishID"                   "AphiaIDPredator"         
[15] "AphiaIDPrey"              "IdentMet"                
[17] "DigestionStage"           "GravMethod"              
[19] "SubFactor"                "PreySequence"            
[21] "Count"                    "UnitWgt"                 
[23] "Weight"                   "UnitLngt"                
[25] "Length"                   "OtherItems"              
[27] "OtherCount"               "OtherWgt"                
[29] "AnalysingOrg"             "Notes"                   

Join all data files (new db) in a specific order: fi -> hi -> pred -> prey.

For some joins, there are multiple column names shared in addition to the key that I could remove and keep only the ID key and the non-shared columns but instead I keep them. First I need to ensure they are the same (VT what is the same and how do you do this?), and not only have the same name. Will also check if both datasets have the same amount of NA before choosing which column to carry from which dataset.

hifi <- left_join(hi, fi, by = "tblUploadID") # join haul data with file info
left_join: added 3 columns (Country, Reporting_organisation, CruiseID)
           > rows only in hi      0
           > rows only in fi (    0)
           > matched rows     1,675
           >                 =======
           > rows total       1,675
comcol_hifi_pred <- intersect(colnames(pred), colnames(hifi))

# Check if any of the two datasets have NA in the common columns which can screw up joining dfs
unique(is.na(hifi |> dplyr::select(all_of(comcol_hifi_pred))))
     tblUploadID tblHaulID  Ship  Gear HaulNo StationNumber  Year Month   Day
[1,]       FALSE     FALSE FALSE FALSE  FALSE         FALSE FALSE FALSE FALSE
[2,]       FALSE     FALSE FALSE FALSE  FALSE         FALSE FALSE FALSE FALSE
      Time Notes
[1,] FALSE  TRUE
[2,] FALSE FALSE
unique(is.na(pred |> dplyr::select(all_of(comcol_hifi_pred))))
     tblUploadID tblHaulID  Ship  Gear HaulNo StationNumber  Year Month   Day
[1,]       FALSE     FALSE FALSE FALSE  FALSE         FALSE FALSE FALSE FALSE
[2,]       FALSE     FALSE FALSE FALSE  FALSE         FALSE FALSE FALSE FALSE
      Time Notes
[1,] FALSE  TRUE
[2,] FALSE FALSE
# The column Notes does have NAs and different meanings in hi and fi so we will remove Notes before joining
pred <- left_join(pred |> dplyr::select(-Notes), # join in predator data
                  hifi |> dplyr::select(-Notes),
                  by = comcol_hifi_pred[!comcol_hifi_pred == "Notes"])
left_join: added 11 columns (ShootLat, ShootLong, HaulLat, HaulLong, ICESrectangle, …)
           > rows only in dplyr::select(pred, -No..       0
           > rows only in dplyr::select(hifi, -No.. (     0)
           > matched rows                            39,277
           >                                        ========
           > rows total                              39,277

Fix pred regurgitated stomachs

# In the Swedish data, 2 means regurgitated and 1 is intact, but for the rest, 1 means regurgitated, 0 or NA means intact. All NA values are from 2020 and 2021 and for all regurguitated=0 before 2005 (see Neuenfeldt 2020 discussion, https://doi.org/10.1093/icesjms/fsz224). So we dont know if they are truly not regurgitated. Therefore there is not much info of value from the regurgitated column. 

pred |>
  filter(Country == "SE") |>
  mutate(regurg_f = as.factor(Regurgitated)) |>
  summarise(regy = n(), .by = c(Year, regurg_f))
filter: removed 38,653 rows (98%), 624 rows remaining
mutate: new variable 'regurg_f' (factor) with 3 unique values and 8% NA
summarise: now 3 rows and 3 columns, ungrouped
# A tibble: 3 × 3
   Year regurg_f  regy
  <dbl> <fct>    <int>
1  2021 1          572
2  2021 <NA>        48
3  2021 2            4
pred <- pred |>
  mutate(Regurgitated_st = Regurgitated,
         Regurgitated_st = if_else(Country == "SE" & Regurgitated == 1, 0, Regurgitated_st),
         Regurgitated_st = if_else(Country == "SE" & Regurgitated == 2, 1, Regurgitated_st),
         Regurgitated_st = replace_na(Regurgitated_st, 0))
mutate: new variable 'Regurgitated_st' (double) with 2 unique values and 0% NA
# Regurgitated stomachs are only about 2% and come from 2021, 2018 and 2013 (and 1 in 1981).
pred |>
  mutate(Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle)) |>
  summarise(count = n(), .by = c(Regurgitated_st, Year)) |>
  filter(Regurgitated_st == 1)
mutate: new variable 'Haul_ID' (character) with 1,674 unique values and 0% NA
summarise: now 55 rows and 3 columns, ungrouped
filter: removed 48 rows (87%), 7 rows remaining
# A tibble: 7 × 3
  Regurgitated_st  Year count
            <dbl> <dbl> <int>
1               1  1981     1
2               1  2013    12
3               1  2021   368
4               1  2014   129
5               1  2017    53
6               1  2018   195
7               1  2016   145
# There are regurgitated stomachs (column in pred) that have information in prey info, i.e. either its incorrect or they have signs of regurgitation but prey in the stomach. These are various prey types from 2018 to 2021. Lets remove these for the rpw analyses.
regurg_ids <- pred |>
  filter(Regurgitated_st == 1) 
filter: removed 38,374 rows (98%), 903 rows remaining
prey |> filter(tblPredatorInformationID %in% regurg_ids$tblPredatorInformationID) |> distinct(Weight, AphiaIDPrey, Year) |> as.data.frame()
filter: removed 53,890 rows (99%), 345 rows remaining
distinct: removed 338 rows (98%), 7 rows remaining
  Weight AphiaIDPrey Year
1     NA          NA 2013
2     NA          NA 2021
3   0.07      125464 2021
4     NA          NA 2014
5   0.17      293511 2014
6     NA          NA 2017
7     NA          NA 2016
# Remove regurgitated stomachs from pred and prey to reduce issues with the relative prey weight becoming incorrect.
pred2 <- pred |>
  filter(!tblPredatorInformationID %in% regurg_ids$tblPredatorInformationID)
filter: removed 903 rows (2%), 38,374 rows remaining
prey2 <- prey |>
  filter(!tblPredatorInformationID %in% regurg_ids$tblPredatorInformationID)
filter: removed 345 rows (1%), 53,890 rows remaining

Join in prey data

Now join predator data to prey data following the same procedure.

intersect(colnames(pred), colnames(prey))
 [1] "tblUploadID"              "tblHaulID"               
 [3] "tblPredatorInformationID" "Ship"                    
 [5] "Gear"                     "HaulNo"                  
 [7] "StationNumber"            "Year"                    
 [9] "Month"                    "Day"                     
[11] "Time"                     "FishID"                  
[13] "AphiaIDPredator"          "Length"                  
# Length is a common column, but it corresponds to predator or prey. Rename!
pred3 <- pred2 |> rename(pred_length = Length,
                       pred_weight = IndWgt)
rename: renamed 2 variables (pred_weight, pred_length)
prey3 <- prey2 |> rename(prey_length = Length,
                       prey_weight = Weight)
rename: renamed 2 variables (prey_weight, prey_length)
comcol_prey_pred <- intersect(colnames(pred3), colnames(prey3))

# Check if any of the two datasets have NA in the common columns
unique(is.na(pred3 |> dplyr::select(all_of(comcol_prey_pred))))
     tblUploadID tblHaulID tblPredatorInformationID  Ship  Gear HaulNo
[1,]       FALSE     FALSE                    FALSE FALSE FALSE  FALSE
     StationNumber  Year Month   Day  Time FishID AphiaIDPredator
[1,]         FALSE FALSE FALSE FALSE FALSE  FALSE           FALSE
unique(is.na(prey3 |> dplyr::select(all_of(comcol_prey_pred))))
     tblUploadID tblHaulID tblPredatorInformationID  Ship  Gear HaulNo
[1,]       FALSE     FALSE                    FALSE FALSE FALSE  FALSE
     StationNumber  Year Month   Day  Time FishID AphiaIDPredator
[1,]         FALSE FALSE FALSE FALSE FALSE  FALSE           FALSE
# Rename "Notes" in prey data to avoid confusion as to which dataset it belongs to
prey3 <- prey3 |> rename(prey_notes = Notes)
rename: renamed one variable (prey_notes)
# Join in pred3 info into prey. There are rows only in pred3 (y) that are not in prey. These are either empty, regurgitated or incorrect. Empties will be added later. We go from 50 000 prey to 62 000 obsevervations
new_db <- left_join(pred3, prey3, by = c(comcol_prey_pred))
left_join: added 17 columns (tblPreyInformationID, AphiaIDPrey, IdentMet, DigestionStage, GravMethod, …)
           > rows only in pred3  12,026
           > rows only in prey3 (     0)
           > matched rows        53,890    (includes duplicates)
           >                    ========
           > rows total          65,916

Fix prey weights etc

Calculate total weight of specific prey species by unique predator ID. For NA and zero weights, we estimate weight if length is present. If length is not NA and Weight is 0 or NA, estimate weight based on length and Count. Else give weight NA and drop it. Because these are not true empty, else there wouldn’t be species-information

# Estimate weight based on count. In the data, if count is >1, the weight is grouped.  In some cases, all of Weight, Count and prey_length is NA, i.e. there are presences of prey but no information for calculating the total weight or there is a length but no count (0) which we will treat as one and retrieve a weight, I think this is less wrong.

# NA length unit matches the number of missing lengths. Where we don't have a unit is where length is lacking.
sum(is.na(new_db$UnitLngt))/length(new_db$UnitLngt)
[1] 0.6835518
sum(is.na(new_db$prey_length))/length(new_db$prey_length)
[1] 0.6878603
# we should end up with a bit less than 50 000 prey items:
new_db |> filter(is.na(prey_weight) & is.na(Count) & is.na(prey_length))
filter: removed 52,606 rows (80%), 13,310 rows remaining
# A tibble: 13,310 × 58
   tblUploadID tblHaulID tblPredatorInformati…¹ Ship  Gear  HaulNo StationNumber
         <dbl>     <dbl>                  <dbl> <chr> <chr>  <dbl>         <dbl>
 1        8247      4705                  55121 90MZ  LBT        3             3
 2        8247      4706                  55123 90MZ  LBT        4             3
 3        8247      4707                  55133 90MZ  LBT        5             3
 4        8247      4707                  55141 90MZ  LBT        5             3
 5        8247      4707                  55155 90MZ  LBT        5             3
 6        8247      4707                  55156 90MZ  LBT        5             3
 7        8247      4691                  55162 90MZ  LBT        6             3
 8        8247      4691                  55171 90MZ  LBT        6             3
 9        8247      4691                  55172 90MZ  LBT        6             3
10        8247      4691                  55174 90MZ  LBT        6             3
# ℹ 13,300 more rows
# ℹ abbreviated name: ¹​tblPredatorInformationID
# ℹ 51 more variables: Year <dbl>, Month <dbl>, Day <dbl>, Time <chr>,
#   FishID <dbl>, AphiaIDPredator <dbl>, pred_weight <dbl>, Number <dbl>,
#   MeasurementIncrement <dbl>, pred_length <dbl>, Code <chr>, Age <dbl>,
#   Sex <chr>, MaturityScale <chr>, MaturityStage <dbl>,
#   PreservationMethod <chr>, Regurgitated <dbl>, StomachFullness <lgl>, …
#prey |> filter(is.na(prey_weight) & is.na(Count) & is.na(prey_length))

new_db |> # we assume that NA in Count equals 1 based on this freq distribution of 1 and NA
  summarise(n= n(), .by = Count) |>
  arrange(-n)
summarise: now 231 rows and 2 columns, ungrouped
# A tibble: 231 × 2
   Count     n
   <dbl> <int>
 1    NA 31103
 2     1 25755
 3     2  2692
 4     3  1349
 5     4   770
 6     5   566
 7     6   451
 8     7   300
 9     8   238
10     9   198
# ℹ 221 more rows
# compare weight of count = 1 and count = NA
new_db |> # The lower mean weight is probably becasue the prey was not intact.
  filter(is.na(Count) | Count == 1) |>
  mutate(Count = replace_na(Count, -1)) |>
  filter(prey_weight > 0) |>
  summarise(mean = mean(prey_weight), med = median(prey_weight),
            .by = Count)
filter: removed 9,058 rows (14%), 56,858 rows remaining
mutate: changed 31,103 values (55%) of 'Count' (31,103 fewer NAs)
filter: removed 13,441 rows (24%), 43,417 rows remaining
summarise: now 2 rows and 3 columns, ungrouped
# A tibble: 2 × 3
  Count  mean   med
  <dbl> <dbl> <dbl>
1    -1  1.60 0.224
2     1  4.94 1.02 
# Species specific length-weights based on preyTaxamatch_LW

# Add a and bs for length weight relationships. Here we may have several as and bs at different taxonomic levels (abTaxlevel). We arrane those so that distinct prioritizes e.g. species over order.
Taxonomic_hierarchy <- c("species","family","class", "order", "phylum", NA)
new_db_preyTaxamatch_LW <- preyTaxamatch_LW |>
  dplyr::select(valid_AphiaID,valid_name,kingdom,phylum,class,order,family,genus,db_AphiaID,db_ScientificName,db_CleanScientificName,abTaxlevel,a,b) |>
  mutate(abTaxlevel = factor(abTaxlevel, Taxonomic_hierarchy)) |>
  arrange(db_AphiaID, abTaxlevel) |>
  distinct(db_AphiaID, .keep_all = TRUE)
mutate: converted 'abTaxlevel' from character to factor (0 new NA)
distinct: removed 123 rows (20%), 481 rows remaining
# join in as and bs and taxonomy
new_db2 <- new_db |> # note necessary argument na_matches
  left_join(new_db_preyTaxamatch_LW, join_by("AphiaIDPrey"=="db_AphiaID"), na_matches = "never")
left_join: added 13 columns (valid_AphiaID, valid_name, kingdom, phylum, class, …)
           > rows only in new_db                        0
           > rows only in new_db_preyTaxamatch_LW (   398)
           > matched rows                          65,916
           >                                      ========
           > rows total                            65,916
# we lack a and b for few species when phylum as a taxonomic level is used but when a and b is missing (when abTaxlevel is NA), we use a = 0.01 and b = 3
new_db2 |> 
  summarise(n = n(), mean_a = mean(a, na.rm = TRUE), median_a = median(a, na.rm = TRUE), mean_b = mean(b, na.rm = TRUE), median_b = median(b, na.rm = TRUE), .by = abTaxlevel)
summarise: now 6 rows and 6 columns, ungrouped
# A tibble: 6 × 6
  abTaxlevel     n  mean_a median_a mean_b median_b
  <fct>      <int>   <dbl>    <dbl>  <dbl>    <dbl>
1 phylum     20489 0.0421   0.0422    2.75     2.75
2 order      13739 0.114    0.0704    2.35     2.49
3 <NA>       15820 0.00947  0.00947   3.03     3.03
4 species    13780 0.0156   0.00674   2.98     3.05
5 family      2042 0.00988  0.0107    3.12     3.13
6 class         46 0.0519   0.0529    2.63     2.62
# Prepare variables and calculate weight (total and individual) of prey given count and prey length
new_db3 <- new_db2 |> 
  mutate(prey_length = if_else(UnitLngt == "mm" & prey_length >= 0, prey_length/10, prey_length, missing = NA), # make cm
         prey_weight = replace_na(prey_weight, -9),
         Count = replace_na(Count, -9),
         prey_length = replace_na(prey_length, -9),
         Count = ifelse(Count <= 0 & (prey_length > 0 | prey_weight > 0), 1, Count), # When Count is Na (-9) or 0 and there is a weight or length, replace with 1.
         prey_weight_source = if_else(prey_weight <= 0 & prey_length > 0 & Count > 0, "estimated", "observed", missing = NA), # Note that -9 length are gets "observed", i.e we will not estimate those weights.
         prey_weight = if_else(prey_weight_source == "estimated", 
                               if_else(is.na(a) & is.na(b), (0.01*(prey_length)^3)*Count,  (a*prey_length^b)*Count),
                               prey_weight, missing = -9),
         prey_ind_weight = if_else(prey_weight > 0 & Count > 0, prey_weight / Count, NA)) |>
  dplyr::select(-UnitLngt) # Since this is no longer valid when we changed scale above
mutate: changed 31,129 values (47%) of 'Count' (31,103 fewer NAs)
        changed 13,463 values (20%) of 'prey_weight' (13,463 fewer NAs)
        changed 51,932 values (79%) of 'prey_length' (45,341 fewer NAs)
        new variable 'prey_weight_source' (character) with 2 unique values and 0% NA
        new variable 'prey_ind_weight' (double) with 5,981 unique values and 20% NA
new_db3 |> # some unreasonable weights but not too much
  filter(prey_weight > 0) |>
  ggplot(aes(prey_weight)) +
  geom_histogram() +
  facet_wrap(~abTaxlevel, scales = "free")
filter: removed 13,369 rows (20%), 52,547 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# # Even if we estimate the prey weight based on the length of the prey and the number of the prey, we still have 151 rows that Weight == -9 or 0. We can give them the average weight but: While this may produce outliers when the predator is small compared to the avg prey size, these would be removed in a later cleaning stage. Lets gove them average weights.

new_db3 |> # Of course there are extreme weights that should not be included, these will be removed when we filter out unreasonable rpw:s (>1).
  filter(prey_weight > 0 & !is.na(AphiaIDPrey)) |>
  ggplot(aes(prey_ind_weight)) +
  facet_wrap(~abTaxlevel, scales = "free") +
  geom_histogram()
filter: removed 14,558 rows (22%), 51,358 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

new_db3 |> # 893 g sprat not possible
  filter(AphiaIDPrey %in% c(293743, 1625944, 126425, 322683, 236448),
         prey_ind_weight > 200) |> as.data.frame()
filter: removed 65,914 rows (>99%), 2 rows remaining
  tblUploadID tblHaulID tblPredatorInformationID Ship Gear HaulNo StationNumber
1        8319      6300                    93265 67BC  TVL     27             1
2        8323      6486                    96765 LA99  TVS     12             1
  Year Month Day Time FishID AphiaIDPredator pred_weight Number
1 2008     3  11 0625     35          126436       695.0      1
2 2004     3  14 0520    157          126436       188.5      1
  MeasurementIncrement pred_length Code Age Sex MaturityScale MaturityStage
1                    1          40 <NA>  NA   M            M6            62
2                    1          27 <NA>  NA   M            M6            62
  PreservationMethod Regurgitated StomachFullness FullStomWgt EmptyStomWgt
1                NBF            0              NA          NA           NA
2                NBF            0              NA          NA           NA
  StomachEmpty GenSamp ShootLat ShootLong HaulLat HaulLong ICESrectangle Depth
1            0       N  56.2717   19.6533      NA       NA          41G9    73
2            0       N  56.5833   20.5667      NA       NA          42H0    63
  Survey ICESDatabase Country Reporting_organisation       CruiseID
1   <NA>            Y      LV                   3140 LV314067BC2008
2   <NA>            Y      LV                   3140 LV3140LA992004
  Regurgitated_st tblPreyInformationID AphiaIDPrey IdentMet DigestionStage
1               0               143163      126425     VISO              1
2               0               147493      126425     VISO              1
  GravMethod SubFactor PreySequence Count UnitWgt prey_weight prey_length
1      WETWT        NA            2     1       g         893        10.5
2      WETWT        NA            2     1       g         324        -9.0
  OtherItems OtherCount OtherWgt AnalysingOrg prey_notes valid_AphiaID
1       <NA>         NA       NA         3140       <NA>        126425
2       <NA>         NA       NA         3140       <NA>        126425
         valid_name  kingdom   phylum     class        order    family    genus
1 Sprattus sprattus Animalia Chordata Teleostei Clupeiformes Clupeidae Sprattus
2 Sprattus sprattus Animalia Chordata Teleostei Clupeiformes Clupeidae Sprattus
  db_ScientificName db_CleanScientificName abTaxlevel           a        b
1              <NA>                   <NA>    species 0.006736452 3.050674
2              <NA>                   <NA>    species 0.006736452 3.050674
  prey_weight_source prey_ind_weight
1           observed             893
2           observed             324
new_db3 |> # Saduria above 10 g is highly unlikely
  filter(AphiaIDPrey %in% c(293511, 119034),
         prey_ind_weight > 10) |> as.data.frame() |>
  arrange(prey_ind_weight) |> 
  slice(1:20)
filter: removed 65,300 rows (99%), 616 rows remaining
slice: removed 596 rows (97%), 20 rows remaining
   tblUploadID tblHaulID tblPredatorInformationID Ship Gear HaulNo
1         8321      6380                    94535 67BC  TVL      1
2         8468     19421                   119073 LABS  LBT      3
3         8307      6097                    88591 90MZ  LBT     36
4         8307      6097                    88591 90MZ  LBT     36
5         8470     19473                   121829 LA99  LBT      2
6         8470     19473                   121829 LA99  LBT      2
7         8470     19473                   121829 LA99  LBT      2
8         8321      6382                    94630 67BC  TVL      2
9         8468     19422                   119166 LABS  LBT      4
10        8468     19419                   119007 LABS  LBT      1
11        8468     19422                   119123 LABS  LBT      4
12        8305      5992                    83228 90MZ  LBT     21
13        8305      5992                    83228 90MZ  LBT     21
14        8305      5992                    83228 90MZ  LBT     21
15        8305      5992                    83228 90MZ  LBT     21
16        8305      5992                    83228 90MZ  LBT     21
17        8305      5992                    83228 90MZ  LBT     21
18        8305      5992                    83228 90MZ  LBT     21
19        8468     19423                   119209 LABS  LBT      5
20        8324      6514                    97457 LA99  TVS      9
   StationNumber Year Month Day Time FishID AphiaIDPredator pred_weight Number
1              1 2006     3   5 0955     28          126436        1325      1
2              2 1967     2   9 1435      7          126436         600      1
3              4 1965     7  22 0900     73          126436        1200      1
4              4 1965     7  22 0900     73          126436        1200      1
5              1 1968     1  13 0700      9          126436        1700      1
6              1 1968     1  13 0700      9          126436        1700      1
7              1 1968     1  13 0700      9          126436        1700      1
8              1 2006     3   5 1145      6          126436         830      1
9              2 1967     2   9 1330     47          126436        1013      1
10             2 1967     2   9 0955     54          126436         670      1
11             2 1967     2   9 1330      4          126436         790      1
12             2 1964     4  27 0700      3          126436         800      1
13             2 1964     4  27 0700      3          126436         800      1
14             2 1964     4  27 0700      3          126436         800      1
15             2 1964     4  27 0700      3          126436         800      1
16             2 1964     4  27 0700      3          126436         800      1
17             2 1964     4  27 0700      3          126436         800      1
18             2 1964     4  27 0700      3          126436         800      1
19             2 1967     2  10 0645     20          126436         640      1
20             2 2003    11  22 1050     10          126436        1630      1
   MeasurementIncrement pred_length Code Age Sex MaturityScale MaturityStage
1                     1          50 <NA>  NA   F            M6            62
2                     1          41 <NA>  NA   M            M6            62
3                     1          49 <NA>  NA   F            M6            65
4                     1          49 <NA>  NA   F            M6            65
5                     1          54 <NA>  NA   F            M6            62
6                     1          54 <NA>  NA   F            M6            62
7                     1          54 <NA>  NA   F            M6            62
8                     1          43 <NA>  NA   M            M6            62
9                     1          48 <NA>  NA   F            M6            65
10                    1          44 <NA>  NA   F            M6            62
11                    1          44 <NA>  NA   F            M6            62
12                    1          42 <NA>  NA   F            M6            62
13                    1          42 <NA>  NA   F            M6            62
14                    1          42 <NA>  NA   F            M6            62
15                    1          42 <NA>  NA   F            M6            62
16                    1          42 <NA>  NA   F            M6            62
17                    1          42 <NA>  NA   F            M6            62
18                    1          42 <NA>  NA   F            M6            62
19                    1          40 <NA>  NA   F            M6            62
20                    1          58 <NA>  NA   F            M6            64
   PreservationMethod Regurgitated StomachFullness FullStomWgt EmptyStomWgt
1                 NBF            0              NA          NA           NA
2                 NBF            0              NA          NA           NA
3                 NBF            0              NA          NA           NA
4                 NBF            0              NA          NA           NA
5                 NBF            0              NA          NA           NA
6                 NBF            0              NA          NA           NA
7                 NBF            0              NA          NA           NA
8                 NBF            0              NA          NA           NA
9                 NBF            0              NA          NA           NA
10                NBF            0              NA          NA           NA
11                NBF            0              NA          NA           NA
12                NBF            0              NA          NA           NA
13                NBF            0              NA          NA           NA
14                NBF            0              NA          NA           NA
15                NBF            0              NA          NA           NA
16                NBF            0              NA          NA           NA
17                NBF            0              NA          NA           NA
18                NBF            0              NA          NA           NA
19                NBF            0              NA          NA           NA
20                NBF            0              NA          NA           NA
   StomachEmpty GenSamp ShootLat ShootLong HaulLat HaulLong ICESrectangle Depth
1             0       N  55.7117   19.9983      NA       NA          40G9    74
2             0       N  55.9833   18.5333      NA       NA          40G8   110
3             0       N  56.4166   20.0833      NA       NA          41H0    80
4             0       N  56.4166   20.0833      NA       NA          41H0    80
5             0       N  56.5833   20.7500      NA       NA          42H0    40
6             0       N  56.5833   20.7500      NA       NA          42H0    40
7             0       N  56.5833   20.7500      NA       NA          42H0    40
8             0       N  55.6766   20.0883      NA       NA          40H0    76
9             0       N  55.7500   18.5833      NA       NA          40G8   110
10            0       N  55.8666   18.9499      NA       NA          40G8    97
11            0       N  55.7500   18.5833      NA       NA          40G8   110
12            0       N  55.5833   20.2500      NA       NA          40H0    74
13            0       N  55.5833   20.2500      NA       NA          40H0    74
14            0       N  55.5833   20.2500      NA       NA          40H0    74
15            0       N  55.5833   20.2500      NA       NA          40H0    74
16            0       N  55.5833   20.2500      NA       NA          40H0    74
17            0       N  55.5833   20.2500      NA       NA          40H0    74
18            0       N  55.5833   20.2500      NA       NA          40H0    74
19            0       N  55.7000   20.3000      NA       NA          40H0    75
20            0       N  55.8667   20.0833      NA       NA          40H0    60
   Survey ICESDatabase Country Reporting_organisation       CruiseID
1    <NA>            Y      LV                   3140 LV314067BC2006
2    <NA>            N      LV                   3140 LV3140LABS1967
3    <NA>            N      LV                   3140 LV314090MZ1965
4    <NA>            N      LV                   3140 LV314090MZ1965
5    <NA>            N      LV                   3140 LV3140LA991968
6    <NA>            N      LV                   3140 LV3140LA991968
7    <NA>            N      LV                   3140 LV3140LA991968
8    <NA>            Y      LV                   3140 LV314067BC2006
9    <NA>            N      LV                   3140 LV3140LABS1967
10   <NA>            N      LV                   3140 LV3140LABS1967
11   <NA>            N      LV                   3140 LV3140LABS1967
12   <NA>            N      LV                   3140 LV314090MZ1964
13   <NA>            N      LV                   3140 LV314090MZ1964
14   <NA>            N      LV                   3140 LV314090MZ1964
15   <NA>            N      LV                   3140 LV314090MZ1964
16   <NA>            N      LV                   3140 LV314090MZ1964
17   <NA>            N      LV                   3140 LV314090MZ1964
18   <NA>            N      LV                   3140 LV314090MZ1964
19   <NA>            N      LV                   3140 LV3140LABS1967
20   <NA>            Y      LV                   3140 LV3140LA992003
   Regurgitated_st tblPreyInformationID AphiaIDPrey IdentMet DigestionStage
1                0               144545      119034     VISO              1
2                0               195122      119034     VISO              1
3                0               136833      119034     VISO              1
4                0               136834      119034     VISO              1
5                0               198804      119034     VISO              1
6                0               198805      119034     VISO              1
7                0               198806      119034     VISO              1
8                0               144686      119034     VISO              1
9                0               195014      119034     VISO              1
10               0               194924      119034     VISO              1
11               0               194977      119034     VISO              1
12               0               123124      119034     VISO              1
13               0               123125      119034     VISO              1
14               0               123126      119034     VISO              1
15               0               123127      119034     VISO              1
16               0               123128      119034     VISO              1
17               0               123129      119034     VISO              1
18               0               123130      119034     VISO              1
19               0               195175      119034     VISO              1
20               0               148288      119034     VISO              1
   GravMethod SubFactor PreySequence Count UnitWgt prey_weight prey_length
1       WETWT        NA            1     1       g       10.01        -9.0
2       WETWT        NA            1     1       g       10.04        -9.0
3       WETWT        NA            1     1       g       10.10         6.0
4       WETWT        NA            2     1       g       10.10         5.9
5       WETWT        NA            2     1       g       10.10         5.6
6       WETWT        NA            3     1       g       10.10         3.2
7       WETWT        NA            4     1       g       10.10         2.9
8       WETWT        NA            2     1       g       10.15        -9.0
9       WETWT        NA            1     1       g       10.19        -9.0
10      WETWT        NA            1     1       g       10.20        -9.0
11      WETWT        NA            1     1       g       10.20        -9.0
12      WETWT        NA            3     1       g       10.34         5.6
13      WETWT        NA            4     1       g       10.34         3.9
14      WETWT        NA            5     1       g       10.34         3.8
15      WETWT        NA            6     1       g       10.34         4.0
16      WETWT        NA            7     1       g       10.34         4.0
17      WETWT        NA            8     1       g       10.34         4.2
18      WETWT        NA            9     1       g       10.34         4.6
19      WETWT        NA            1     1       g       10.40        -9.0
20      WETWT        NA            3     1       g       10.44         6.3
   OtherItems OtherCount OtherWgt AnalysingOrg prey_notes valid_AphiaID
1        <NA>         NA       NA         3140       <NA>        119034
2        <NA>         NA       NA         3140       <NA>        119034
3        <NA>         NA       NA         3140       <NA>        119034
4        <NA>         NA       NA         3140       <NA>        119034
5        <NA>         NA       NA         3140       <NA>        119034
6        <NA>         NA       NA         3140       <NA>        119034
7        <NA>         NA       NA         3140       <NA>        119034
8        <NA>         NA       NA         3140       <NA>        119034
9        <NA>         NA       NA         3140       <NA>        119034
10       <NA>         NA       NA         3140       <NA>        119034
11       <NA>         NA       NA         3140       <NA>        119034
12       <NA>         NA       NA         3140       <NA>        119034
13       <NA>         NA       NA         3140       <NA>        119034
14       <NA>         NA       NA         3140       <NA>        119034
15       <NA>         NA       NA         3140       <NA>        119034
16       <NA>         NA       NA         3140       <NA>        119034
17       <NA>         NA       NA         3140       <NA>        119034
18       <NA>         NA       NA         3140       <NA>        119034
19       <NA>         NA       NA         3140       <NA>        119034
20       <NA>         NA       NA         3140       <NA>        119034
        valid_name  kingdom     phylum        class   order       family
1  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
2  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
3  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
4  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
5  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
6  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
7  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
8  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
9  Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
10 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
11 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
12 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
13 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
14 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
15 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
16 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
17 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
18 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
19 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
20 Saduria entomon Animalia Arthropoda Malacostraca Isopoda Chaetiliidae
     genus db_ScientificName db_CleanScientificName abTaxlevel         a
1  Saduria              <NA>                   <NA>      order 0.1693124
2  Saduria              <NA>                   <NA>      order 0.1693124
3  Saduria              <NA>                   <NA>      order 0.1693124
4  Saduria              <NA>                   <NA>      order 0.1693124
5  Saduria              <NA>                   <NA>      order 0.1693124
6  Saduria              <NA>                   <NA>      order 0.1693124
7  Saduria              <NA>                   <NA>      order 0.1693124
8  Saduria              <NA>                   <NA>      order 0.1693124
9  Saduria              <NA>                   <NA>      order 0.1693124
10 Saduria              <NA>                   <NA>      order 0.1693124
11 Saduria              <NA>                   <NA>      order 0.1693124
12 Saduria              <NA>                   <NA>      order 0.1693124
13 Saduria              <NA>                   <NA>      order 0.1693124
14 Saduria              <NA>                   <NA>      order 0.1693124
15 Saduria              <NA>                   <NA>      order 0.1693124
16 Saduria              <NA>                   <NA>      order 0.1693124
17 Saduria              <NA>                   <NA>      order 0.1693124
18 Saduria              <NA>                   <NA>      order 0.1693124
19 Saduria              <NA>                   <NA>      order 0.1693124
20 Saduria              <NA>                   <NA>      order 0.1693124
          b prey_weight_source prey_ind_weight
1  2.149667           observed           10.01
2  2.149667           observed           10.04
3  2.149667           observed           10.10
4  2.149667           observed           10.10
5  2.149667           observed           10.10
6  2.149667           observed           10.10
7  2.149667           observed           10.10
8  2.149667           observed           10.15
9  2.149667           observed           10.19
10 2.149667           observed           10.20
11 2.149667           observed           10.20
12 2.149667           observed           10.34
13 2.149667           observed           10.34
14 2.149667           observed           10.34
15 2.149667           observed           10.34
16 2.149667           observed           10.34
17 2.149667           observed           10.34
18 2.149667           observed           10.34
19 2.149667           observed           10.40
20 2.149667           observed           10.44
# Saduria has a mean weight of 1 g. max weight of 7,8 or 9. The count may be off etc. If analysing saduria individuals level, we need to revisit these values (https://doi.org/10.1111/j.0021-8790.2004.00800.x and https://www.jstor.org/stable/24831823 ). Not that outliers likely disappear when I filter on relative prey weight (<1).

# To control for outliers affecting average of individual weight, remove values outside quartiles
prey_avg_ind_weight <- new_db3 |>
  filter(!is.na(AphiaIDPrey)) |>
  group_by(AphiaIDPrey) |>
  filter(!prey_ind_weight %in% boxplot.stats(prey_ind_weight)$out) |>
  reframe(avg_weight = mean(prey_ind_weight, na.rm = TRUE)) |>
  ungroup()
filter: removed 13,929 rows (21%), 51,987 rows remaining
group_by: one grouping variable (AphiaIDPrey)
filter (grouped): removed 4,910 rows (9%), 47,077 rows remaining (removed 0 groups, 82 groups remaining)
ungroup: no grouping variables remain
prey_avg_ind_weight |> filter(is.na(avg_weight))
filter: removed 79 rows (96%), 3 rows remaining
# A tibble: 3 × 2
  AphiaIDPrey avg_weight
        <dbl>      <dbl>
1      104152        NaN
2      104241        NaN
3      234024        NaN
# This should be the average prey weight which we can use to calculate the weight of these prey if we have the counts. Left join that summarized data and do the estimate of weight based on length. But first figure out which unit prey size is

# Join average weight and estimate weight
new_db4 <- new_db3 |>
  left_join(prey_avg_ind_weight) |>
  mutate(prey_weight = if_else(prey_weight <= 0 & Count > 0 & !is.na(AphiaIDPrey), Count * avg_weight, prey_weight, missing = -9),
         prey_weight = replace_na(prey_weight, 0)) # New name and remove NaNs created when estimating avg weight excluding outliers (mean() creates NaNs).
Joining with `by = join_by(AphiaIDPrey)`
left_join: added one column (avg_weight)
> rows only in new_db3 13,929
> rows only in prey_avg_ind_weight ( 0)
> matched rows 51,987
> ========
> rows total 65,916
mutate: changed 56 values (<1%) of 'prey_weight' (0 new NAs)
# Those where prey weight is missing is duie to missing lengths or counts.
new_db4 |> filter(prey_weight <= 0) |> as.data.frame() |> distinct(prey_weight, prey_length, Count)
filter: removed 52,600 rows (80%), 13,316 rows remaining
distinct: removed 13,310 rows (>99%), 6 rows remaining
  prey_weight prey_length Count
1          -9          -9    -9
2           0          -9     1
3           0          -9     3
4           0          -9     2
5          -9          -9     1
6          -9          -9     3
# Remove these (nearly 12000 rows, which are mainly the empties but also 270 na count/weight/length in prey.
new_db4 <- new_db4 |>
  filter(!(prey_weight <= 0 & Count <= 0 & prey_length <= 0), 
         !is.na(prey_weight)) # three rows
filter: removed 13,310 rows (20%), 52,606 rows remaining

Summarise prey weights by predator and prey group

# Make prey groups based on taxonomy. Groups: herring, sprat, saduria, chordata, invertebrates, other 
# clupeids are key prey and should be either herring or sprat
new_db4 |> # We split the non idd clupeids into sprat and herring in the next chunk.
  filter(family == "Clupeidae") |> #%in% c("Clupea harengus", "Sprattus sprattus")) |>
  summarise(n = n(), .by= c(valid_name))
filter: removed 42,925 rows (82%), 9,681 rows remaining
summarise: now 4 rows and 2 columns, ungrouped
# A tibble: 4 × 2
  valid_name            n
  <chr>             <int>
1 Clupeidae           332
2 Sprattus sprattus  7000
3 Clupea harengus    2284
4 Clupea               65
# Saduria are also key prey, but... 
new_db4 |> # No non idd Isopods 
  filter(order == "Isopoda") |> #%in% c("Clupea harengus", "Sprattus sprattus")) |>
  summarise(n = n(), .by= c(valid_name))
filter: removed 46,213 rows (88%), 6,393 rows remaining
summarise: now 3 rows and 2 columns, ungrouped
# A tibble: 3 × 2
  valid_name          n
  <chr>           <int>
1 Saduria entomon  6386
2 Idotea balthica     6
3 Isopoda             1
# make the groups
new_db5 <- new_db4 |> # arguments are evaluated in order so proceed from specific to general
  mutate(prey_group = case_when(valid_name == "Clupea harengus"~ "herring",
                                valid_name == "Sprattus sprattus"~ "sprat",
                                valid_name == "Saduria entomon" ~ "saduria",
                                family == "Clupeidae" ~ "unidd_clupeids", # temporary, see below
                                #is.na(valid_name) ~ "other",
                                phylum == "Chordata"~ "other_chords",
                                kingdom == "Animalia" & !phylum == "Chordata" ~ "other_inverts",
                                .default = "other"))
mutate: new variable 'prey_group' (character) with 7 unique values and 0% NA
# seems to work
new_db5 |>
  summarise(n = n(), .by= prey_group)
summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
  prey_group         n
  <chr>          <int>
1 other_inverts  30273
2 saduria         6386
3 other_chords    5073
4 unidd_clupeids   397
5 sprat           7000
6 herring         2284
7 other           1193
new_db5 |> # only NA prey ids and Fucus in the other group
  filter(prey_group == "other") |>
  summarise(n = n(), .by = AphiaIDPrey)
filter: removed 51,413 rows (98%), 1,193 rows remaining
summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
  AphiaIDPrey     n
        <dbl> <int>
1      144129     1
2          NA  1192
# The NAs
new_db5 |> 
  filter(is.na(prey_group))
filter: removed all rows (100%)
# A tibble: 0 × 74
# ℹ 74 variables: tblUploadID <dbl>, tblHaulID <dbl>,
#   tblPredatorInformationID <dbl>, Ship <chr>, Gear <chr>, HaulNo <dbl>,
#   StationNumber <dbl>, Year <dbl>, Month <dbl>, Day <dbl>, Time <chr>,
#   FishID <dbl>, AphiaIDPredator <dbl>, pred_weight <dbl>, Number <dbl>,
#   MeasurementIncrement <dbl>, pred_length <dbl>, Code <chr>, Age <dbl>,
#   Sex <chr>, MaturityScale <chr>, MaturityStage <dbl>,
#   PreservationMethod <chr>, Regurgitated <dbl>, StomachFullness <lgl>, …
# only fish in other Chordates 
new_db5 |>  
  filter(prey_group %in% c("other_chords")) |>
  summarise(n= n(), .by = valid_name)
filter: removed 47,533 rows (90%), 5,073 rows remaining
summarise: now 21 rows and 2 columns, ungrouped
# A tibble: 21 × 2
   valid_name                     n
   <chr>                      <int>
 1 Pomatoschistus minutus       885
 2 Cottus gobio                   6
 3 Gadus morhua                 126
 4 Ammodytes tobianus            18
 5 Enchelyopus cimbrius         121
 6 Pomatoschistus                66
 7 Gasterosteus aculeatus       492
 8 Osmerus eperlanus             52
 9 Myoxocephalus quadricornis     1
10 Platichthys flesus            16
# ℹ 11 more rows
# only invert phyla
new_db5 |>  
  filter(prey_group %in% c("other_inverts")) |>
  summarise(n= n(), .by = phylum)
filter: removed 22,333 rows (42%), 30,273 rows remaining
summarise: now 6 rows and 2 columns, ungrouped
# A tibble: 6 × 2
  phylum         n
  <chr>      <int>
1 Arthropoda 24559
2 Annelida    5424
3 Priapulida   185
4 Mollusca      92
5 Nematoda      12
6 Ctenophora     1

With these estimates of weight based on either length and in a very few cases average weight of that prey, we calculate the total weight of these prey per individual predator stomach, and then pivot wider.

# make predator individual summary of weight
new_pg <- new_db5 |> 
  summarise(prey_tot_weight = sum(prey_weight), .by = c(tblPredatorInformationID, prey_group)) |>
  filter(prey_tot_weight > 0)
summarise: now 32,441 rows and 3 columns, ungrouped
filter: removed 4 rows (<1%), 32,437 rows remaining
# No NAs which seems good
sum(is.na(new_pg$prey_group))
[1] 0
sum(is.na(new_pg$prey_tot_weight))
[1] 0
# change to wide format to get predatorID unique rows.
new_pg <- new_pg |>
  pivot_wider(names_from = "prey_group", values_from = "prey_tot_weight", values_fill = 0)
pivot_wider: reorganized (prey_group, prey_tot_weight) into (other_inverts,
saduria, other_chords, unidd_clupeids, sprat, …) [was 32437x3, now 25740x8]
# no duplicates of predator ID.
new_pg |> group_by(tblPredatorInformationID) |> filter(n()>1)
group_by: one grouping variable (tblPredatorInformationID)
filter (grouped): removed all rows (100%)
# A tibble: 0 × 8
# Groups:   tblPredatorInformationID [0]
# ℹ 8 variables: tblPredatorInformationID <dbl>, other_inverts <dbl>,
#   saduria <dbl>, other_chords <dbl>, unidd_clupeids <dbl>, sprat <dbl>,
#   herring <dbl>, other <dbl>
# Split unidentified clupeids on sprat and herring  
new_pg <- new_pg |>
  mutate(sprat = sprat + unidd_clupeids*0.5,
         herring = herring + unidd_clupeids*0.5) |>
  dplyr::select(!unidd_clupeids)
mutate: changed 332 values (1%) of 'sprat' (0 new NAs)
        changed 332 values (1%) of 'herring' (0 new NAs)

Next I will left_join in the remaining predator information, and after that bind_rows “empty stomachs” (with respect to these 3 prey species). Since the IDs are not overlapping, it doesn’t matter that I already have some 0’s here for some species

new_pg_pred <- new_pg |>
  left_join(pred |> rename(pred_length = Length, 
                           pred_weight = IndWgt), by = "tblPredatorInformationID")
rename: renamed 2 variables (pred_weight, pred_length)
left_join: added 40 columns (tblUploadID, tblHaulID, Ship, Gear, HaulNo, …)
           > rows only in new_pg                          0
           > rows only in rename(pred, pred_lengt.. (13,537)
           > matched rows                            25,740
           >                                        ========
           > rows total                              25,740
# two empty stomachs
new_pg_pred |>
  rowwise() |>
  filter(sum(saduria, other_inverts, other_chords, sprat, herring, other) == 0) |>
  ungroup()
filter: removed all rows (100%)
ungroup: no grouping variables remain
# A tibble: 0 × 47
# ℹ 47 variables: tblPredatorInformationID <dbl>, other_inverts <dbl>,
#   saduria <dbl>, other_chords <dbl>, sprat <dbl>, herring <dbl>, other <dbl>,
#   tblUploadID <dbl>, tblHaulID <dbl>, Ship <chr>, Gear <chr>, HaulNo <dbl>,
#   StationNumber <dbl>, Year <dbl>, Month <dbl>, Day <dbl>, Time <chr>,
#   FishID <dbl>, AphiaIDPredator <dbl>, pred_weight <dbl>, Number <dbl>,
#   MeasurementIncrement <dbl>, pred_length <dbl>, Code <chr>, Age <dbl>,
#   Sex <chr>, MaturityScale <chr>, MaturityStage <dbl>, …

Now add in the “empty stomachs” from pred3 using bind_rows. When I bind_rows, the columns that are not matching get NA. The only column not matching should be the average weight columns. They will get NA, and I’ll change it to 0.

Bind in empty stomachs

# The empty stomachs to be added are those that are empty in predator data. For data before 2005 we don't know whether they are regurgitated or empty because of not eating. For an analysis using all data (1963-) we need to keep that in mind.
# Because of the issue of not getting prey weight even though they are present above, we need to make sure to drop these stomachs in the full data set also before joining, so that we don't treat them as empty!

# StomachEmpty = 1 is an empty stomach even though this is not an mandatory parameter (?). Since regurgitated==1 and NA have been removed, all empty stomachs seem to be truly empty, i.e. there are no tblpredatorinformationIDs with stomach empty that are in the prey data (No overlap in stomachs between the predator and prey data sets).
empty_stom <- pred3 |>
  filter(StomachEmpty == 1) |>
  filter(!tblPredatorInformationID %in% unique(new_pg$tblPredatorInformationID))
filter: removed 28,414 rows (74%), 9,960 rows remaining
filter: no rows removed
# Bind rows!
new_dat <- new_pg_pred |>
  bind_rows(empty_stom)

# Added NAs with empty stomachs, replace with 0
new_dat2 <- new_dat |> 
  mutate(other = replace_na(other, 0),
         sprat = replace_na(sprat, 0),
         herring = replace_na(herring, 0),
         saduria = replace_na(saduria, 0),
         other_inverts = replace_na(other_inverts, 0),
         other_chords = replace_na(other_chords, 0))
mutate: changed 9,960 values (28%) of 'other_inverts' (9,960 fewer NAs)
        changed 9,960 values (28%) of 'saduria' (9,960 fewer NAs)
        changed 9,960 values (28%) of 'other_chords' (9,960 fewer NAs)
        changed 9,960 values (28%) of 'sprat' (9,960 fewer NAs)
        changed 9,960 values (28%) of 'herring' (9,960 fewer NAs)
        changed 9,960 values (28%) of 'other' (9,960 fewer NAs)
# Fix that Denmark report in kg
new_dat2 |>
  ggplot(aes(pred_weight, fill = Country)) + 
  facet_wrap(~Country, scales = "free") + 
  geom_histogram() +
  guides( fill = "none")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 687 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

new_dat2 <- new_dat2 |> 
  mutate(pred_weight = ifelse(Country %in% c("DK"), pred_weight*1000, pred_weight))
mutate: changed 2,286 values (6%) of 'pred_weight' (0 new NAs)
# there are negative pred weights from LV in the 1980s. We deal with them when we estimate weights (part 7.). IF we remove them, we see that cods in the data was small in the 1980.
new_dat2 |>
  filter(pred_weight > 0) |>
  mutate(decade = round(Year/10) * 10) |>
  summarise(mean_predw = mean(pred_weight, na.rm = TRUE), n = n(), .by = c(decade, Country))
filter: removed 879 rows (2%), 34,821 rows remaining
mutate: new variable 'decade' (double) with 7 unique values and 0% NA
summarise: now 11 rows and 4 columns, ungrouped
# A tibble: 11 × 4
   decade Country mean_predw     n
    <dbl> <chr>        <dbl> <int>
 1   1970 LV           281.   9112
 2   1980 LV            12.4  4183
 3   1960 LV           470.   7042
 4   1990 LV          1076.    537
 5   2000 LV           501.   5652
 6   2010 LV           404.   4653
 7   2020 SE           298.    478
 8   2020 PL           408.    654
 9   2010 DK           635.    494
10   2020 DK           486.   1792
11   2020 DE           162.    224
# but their distribution seems good. 
new_dat2 |>
  filter(Year > 1979 & Year < 1990 & Country == "LV",
         pred_weight > 0) |>
  ggplot(aes(pred_weight)) + 
  geom_histogram() +
  guides( fill = "none") +
  labs(title = "LV 1980s")
filter: removed 33,930 rows (95%), 1,770 rows remaining
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# Also, note that most historical data in the NDB are from LV
new_dat2 |> 
  mutate(decade = round(Year/10) * 10) |>
  ggplot(aes(pred_weight, color = factor(decade))) + 
  facet_wrap(~Country, scales = "free") +
  geom_freqpoly(binwidth=100) +
  guides( fill = "none")
mutate: new variable 'decade' (double) with 7 unique values and 0% NA
Warning: Removed 687 rows containing non-finite outside the scale range (`stat_bin()`).
The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# Sweden report in mm
new_dat2 |> 
  ggplot(aes(pred_length, fill = Country)) + 
  facet_wrap(~Country, scales = "free") +
  geom_histogram() +
  guides( fill = "none")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

new_dat2 <- new_dat2 |> 
  mutate(pred_length = ifelse(Country == "SE", pred_length/10, pred_length))
mutate: changed 478 values (1%) of 'pred_length' (0 new NAs)
# Remove non cod predators (e.g. Whiting 126438)
new_data <- new_dat2 |> filter(AphiaIDPredator %in% c(126436))
filter: removed 224 rows (1%), 35,476 rows remaining
new_data |>
  summarise(count = n(), .by = c(Year, Country)) |>
  ggplot() +
  geom_bar(aes(Year, count, fill = Country), alpha = 1, stat="identity", position = "stack") +
  scale_x_continuous(n.breaks = 10) + ylab("# stomachs") +
  labs(title="New data base")
summarise: now 50 rows and 3 columns, ungrouped
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

4. Old database data

The data in the old ICES database contains much more data than what is found in the new version of the database but is in poor shape containing many errors and unique identiifers are missing. Below, identifiers for hauls and predators are made and the old db is cleaned.

Make unique identifier

glimpse(ODB)
Rows: 574,684
Columns: 52
$ Dataset                     <chr> "Stomach data: Latvian historic", "Stomach…
$ RecordType                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Country                     <chr> "Latvia", "Latvia", "Latvia", "Latvia", "L…
$ Ship                        <chr> "MAZ", "UNKN", "UNKN", "UNKN", "UNKN", "ZB…
$ Latitude                    <dbl> 54.25, 54.25, 54.25, 54.25, 54.25, 54.25, …
$ Longitude                   <dbl> 14.5, 15.5, 15.5, 15.5, 15.5, 15.5, 15.5, …
$ Estimated_Lat_Lon           <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ICES_StatRec                <chr> "37G4", "37G5", "37G5", "37G5", "37G5", "3…
$ ICES_AreaCode               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Year                        <dbl> 1968, 1973, 1973, 1973, 1973, 1983, 1983, …
$ Month                       <chr> "9", "5", "5", "5", "5", "12", "12", "1", …
$ Day                         <chr> "19", "11", "11", "11", "11", "16", "16", …
$ Time                        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Station                     <chr> "1", "15", "15", "15", "15", "7", "7", "30…
$ Haul                        <chr> "1", "15", "15", "15", "15", "7", "7", "30…
$ Sampling_Method             <chr> "Demersal sampling", "Demersal sampling", …
$ Depth                       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Temperature                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `SampleNo(FishID)`          <dbl> 52, 100, 69, 74, 97, 4, 61, 11, 18, 26, 30…
$ ICES_SampleID               <dbl> 2628171, 2528147, 2528139, 2528140, 252814…
$ Predator_AphiaID            <dbl> 126436, 126436, 126436, 126436, 126436, 12…
$ Predator_LatinName          <chr> "Gadus morhua", "Gadus morhua", "Gadus mor…
$ `Predator_Weight(mean)`     <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `Predator_Age(mean)`        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `Predator_Lengh(mean)`      <dbl> 25, 17, 17, 18, 16, 60, 77, 18, 25, 19, 19…
$ Predator_LowerLengthBound   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Predator_UpperLengthBound   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Predator_CPUE               <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ `GallBladder_stage(class)`  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_METFP               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_TotalNo             <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_WithFood            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Stomach_Regurgitated        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Stomach_WithSkeletalRemains <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ Stomach_Empty               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ Stomach_ContentWgt          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_EmptyWgt            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_fullness            <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Stomach_Item                <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ ICES_ItemID                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_AphiaID                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_LatinName              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_IdentMet               <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_DigestionStage         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_TotalNo                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_Weight                 <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_LengthIdentifier       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_Length                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_LowerLengthBound       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_UpperLengthBound       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Prey_MinNo                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ Remarks                     <chr> "Reported Stomach_Item: ; Repor", "Reporte…
# Each row in old data contains a prey item, i.e. we have many rows for each predator and there is no unique predator ID. We need to generate an identifier so that we can compare observations the old and new data and remove overlapping data. The ICESsampleID is for the predator I assume, ICESitemID are prey items but there are duplicates, possibly mainly due to the two data sets (Year of stomach and Stomach tender using the same IDs which become duplicates?).  

# By creating an ID with Date (Year, Month and Day), Country and Haul (HaulNo (new data) and Haul (old data)) we can identify the data missing from the new database. Ship info in the old data is unclean, by instead assuming that Country should be unique to date and haul (sort of Country as a proxy for Ship). However, there are 41506 rows where country and ship is missing. VT removes these. 
#length(is.na(old_db[which(is.na(old_db$Country)),]$Ship))

old_db <- janitor::remove_empty(ODB, which = "cols") # remove cols with only Nas to increase readability

# fix country names so that they equal the other datasets
unique(old_db$Country)
 [1] "Latvia"          "Denmark"         "Poland"          "Sweden"         
 [5] NA                "France"          "Germany"         "Norway"         
 [9] "The Netherlands" "United Kingdom"  "Belgium"         "Ireland"        
old_db <- old_db |>
  mutate(Country = case_when(Country == "Latvia" ~ "LV",
                             Country == "Denmark" ~ "DK",
                             Country == "Poland" ~ "PL",
                             Country == "Sweden" ~ "SE",
                             Country == "Germany" ~ "DE",
                             is.na(Country) ~ NA,
                             .default = "NotBaltic"))
mutate: changed 533,178 values (93%) of 'Country' (0 new NAs)
old_db %>%
  filter(Country %in% c("NotBaltic",NA)) %>%
  count(Country)
filter: removed 392,826 rows (68%), 181,858 rows remaining
count: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
  Country        n
  <chr>      <int>
1 NotBaltic 140352
2 <NA>       41506
sum(is.na(old_db$Haul))
[1] 50317
sum(is.na(old_db$ICES_StatRec))
[1] 0
# we get rid of almost all NAs (198 are left) in Haul by excluding based on lat lon (i.e. outside the Baltic making Country not necessary)
# filter and create haul identifier 
old_db <- old_db |> 
  rename(HaulNo = Haul,
         ICESrectangle = ICES_StatRec) |> 
  filter(# remove non cods
         Predator_AphiaID %in% c(126436),
         # remove data outside the Baltic (this removed all Country == NA)
         between(Latitude, 53, 60) & between(Longitude, 12.5, 24)) |> 
  mutate(Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"))
rename: renamed 2 variables (ICESrectangle, HaulNo)
filter: removed 207,088 rows (36%), 367,596 rows remaining
mutate: new variable 'Haul_ID' (character) with 2,649 unique values and 0% NA
old_db %>%
  filter(Country %in% c("NotBaltic",NA)) %>%
  count(Country)
filter: removed all rows (100%)
count: now 0 rows and 2 columns, ungrouped
# A tibble: 0 × 2
# ℹ 2 variables: Country <chr>, n <int>
# join hi and fi from new db to get columns for identifier
# add Country and ICESrectangle to new db prey data from haul data and create identifier
prey_temp <- prey |>
  left_join(dplyr::select(hifi, tblUploadID, Country, ICESrectangle), by = "tblUploadID", multiple = "any") |>
  mutate(Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"))
left_join: added 2 columns (Country, ICESrectangle)
           > rows only in prey                            0
           > rows only in dplyr::select(hifi, tbl.. (    59)
           > matched rows                            54,235
           >                                        ========
           > rows total                              54,235
mutate: new variable 'Haul_ID' (character) with 1,589 unique values and 0% NA
# identify cod in hauls in the Baltic not present in the new data. I.e. what the old db is adding to the new db. For the January 2024 ICES db download, this removes 23 487 rows.
missing_db <- old_db |>
  anti_join(prey_temp, by = "Haul_ID") # return all rows from x (old_d) without a match in y (prey)
anti_join: added no columns
           > rows only in old_db     344,109
           > rows only in prey_temp ( 48,707)
           > matched rows           ( 23,487)
           >                        =========
           > rows total              344,109

Correct coordinates where possible

Below we update ICES rectangle center based coordinates in the ODB with coordinates from matching BITS surveys.

# # few data points have estimated the coordinates according to Estimated_Lat_Lon
sum(old_db$Estimated_Lat_Lon == "Yes", na.rm = TRUE)/nrow(old_db) 
[1] 0.0001904264
missing_db |> # But its likely all ODB data as the lat lon of the rectangle midpoint equals the lat lon columns
  filter(ices.rect(ICESrectangle)$lon == Longitude & ices.rect(ICESrectangle)$lat == Latitude) |>
  count()
filter: no rows removed
count: now one row and one column, ungrouped
# A tibble: 1 × 1
       n
   <int>
1 344109
length(unique(missing_db$Longitude)) # only 11 unique lats 
[1] 11
missing_db |> # and 60 unique coords in the old db
  dplyr::select(Latitude, Longitude) |> 
  distinct()
distinct: removed 344,049 rows (>99%), 60 rows remaining
# A tibble: 60 × 2
   Latitude Longitude
      <dbl>     <dbl>
 1     54.2      14.5
 2     54.2      15.5
 3     54.2      19.5
 4     54.8      14.5
 5     54.8      15.5
 6     54.8      19.5
 7     55.2      15.5
 8     55.2      16.5
 9     55.2      17.5
10     55.2      18.5
# ℹ 50 more rows
# Make an identifier
intersect(tolower(colnames(missing_db)), tolower(colnames(bits_hh)))
[1] "country" "ship"    "year"    "month"   "day"     "haulno"  "depth"  
sum(is.na(missing_db$HaulNo)) # a few rows lacks haul number.
[1] 198
missing_db2 <- missing_db |> 
  mutate(Hid = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_")) #
mutate: new variable 'Hid' (character) with 2,441 unique values and 0% NA
bits_hh <- bits_hh |> 
  mutate(Hid = paste(Country, Year, Month, Day, HaulNo, StatRec, sep = "_"))
mutate: new variable 'Hid' (character) with 18,105 unique values and 0% NA
length(intersect(missing_db2$Hid, bits_hh$Hid) ) # 553 unique hauls that are matching based on Country, Year, Month, Day, ices_rect
[1] 553
missing_db2 |> # We should have done better in the current millenium... 
  filter(!Hid %in% unique(bits_hh$Hid)) |>
  mutate(decade = round(Year/10) * 10) |>
  group_by(decade, Country) |>
  count() |>
  ungroup()
filter: removed 29,670 rows (9%), 314,439 rows remaining
mutate: new variable 'decade' (double) with 6 unique values and 0% NA
group_by: 2 grouping variables (decade, Country)
count: now 10 rows and 3 columns, 2 group variables remaining (decade, Country)
ungroup: no grouping variables remain
# A tibble: 10 × 3
   decade Country      n
    <dbl> <chr>    <int>
 1   1960 LV       28780
 2   1970 LV      102072
 3   1980 LV      120830
 4   1990 DK          70
 5   1990 LV       45164
 6   2000 LV          85
 7   2010 DK       11708
 8   2010 LV         147
 9   2010 PL        2798
10   2010 SE        2785
# Now I add lat/lon values from bits_hh to replace those in mm
missing_db3 <- missing_db2 |> 
  left_join(bits_hh |> dplyr::select(ShootLat, ShootLong, Hid)) |>
  mutate(lat = ifelse(is.na(ShootLat), Latitude, ShootLat),
         lon = ifelse(is.na(ShootLong), Longitude, ShootLong))
Joining with `by = join_by(Hid)`
left_join: added 2 columns (ShootLat, ShootLong)
> rows only in missing_db2 314,439
> rows only in dplyr::select(bits_hh, .. ( 17,552)
> matched rows 29,670
> =========
> rows total 344,109
mutate: new variable 'lat' (double) with 356 unique values and 0% NA
new variable 'lon' (double) with 372 unique values and 0% NA
# now there is 581 unique coordinates instead of 11
missing_db3 |>
  dplyr::select(lat, lon) |> 
  distinct()
distinct: removed 343,528 rows (>99%), 581 rows remaining
# A tibble: 581 × 2
     lat   lon
   <dbl> <dbl>
 1  54.2  14.5
 2  54.2  15.5
 3  54.2  19.5
 4  54.8  14.5
 5  54.8  15.5
 6  54.8  19.5
 7  55.2  15.5
 8  55.2  16.5
 9  55.2  17.5
10  55.2  18.5
# ℹ 571 more rows

Fix prey weights

Fix prey lengths, counts and estimate weights based on taxonomic a and b.

missing_db3 |> # 819 prey weights out of 71 000 should be possible to fix
  dplyr::select(Prey_LowerLengthBound, Prey_UpperLengthBound, Prey_Weight, Prey_TotalNo) |>
  filter(is.na(Prey_Weight) | Prey_Weight == 0) |>
  filter((Prey_LowerLengthBound > 0 | Prey_UpperLengthBound > 0))
filter: removed 273,152 rows (79%), 70,957 rows remaining
filter: removed 70,138 rows (99%), 819 rows remaining
# A tibble: 819 × 4
   Prey_LowerLengthBound Prey_UpperLengthBound Prey_Weight Prey_TotalNo
                   <dbl>                 <dbl>       <dbl>        <dbl>
 1                 125                      NA          NA            1
 2                   8                      NA          NA         -999
 3                   8                      NA           0         -999
 4                   1.5                    NA           0         -999
 5                   1.5                    NA          NA         -999
 6                   3.9                    NA           0            1
 7                   6.3                    NA           0            1
 8                   3.7                    NA           0            1
 9                   4.2                    NA           0            1
10                   5.5                    NA           0            1
# ℹ 809 more rows
# fix prey lengths and weights. Estimate weight from length and count. If count is 0 or NA when length > 0, assume count=1.
missing_db3 |>  # 70 957 zeros out of which many are NA
  filter(Prey_Weight > 0) |>
  summarise(n()) 
filter: removed 70,957 rows (21%), 273,152 rows remaining
summarise: now one row and one column, ungrouped
# A tibble: 1 × 1
   `n()`
   <int>
1 273152
# rename and fix count, weight and lengths
missing_db4 <- missing_db3 |> 
  rename(prey_count = Prey_TotalNo,
         prey_weight = Prey_Weight) |>
  mutate(Prey_UpperLengthBound = replace_na(Prey_UpperLengthBound, 0),
         Prey_LowerLengthBound = replace_na(Prey_LowerLengthBound, 0),
         prey_length = if_else(Prey_UpperLengthBound == 0, Prey_LowerLengthBound, 
                               (Prey_UpperLengthBound + Prey_LowerLengthBound)/2, 
                               missing = NA),# there were few Upper (233)
         prey_count = replace_na(prey_count, 0),
         prey_weight = replace_na(prey_weight, 0),
         prey_weight_source = if_else(prey_weight > 0, "observed",
                                      if_else(prey_count > 0 & prey_length > 0, "estimated",
                                              if_else(prey_count == 0 & prey_length > 0, "estimated_zerocount", 
                                                      NA, missing = NA), 
                                              missing = NA)))
rename: renamed 2 variables (prey_count, prey_weight)
mutate: changed 80,919 values (24%) of 'prey_count' (80,919 fewer NAs)
        changed 36,390 values (11%) of 'prey_weight' (36,390 fewer NAs)
        changed 125,882 values (37%) of 'Prey_LowerLengthBound' (125,882 fewer NAs)
        changed 344,070 values (>99%) of 'Prey_UpperLengthBound' (344,070 fewer NAs)
        new variable 'prey_length' (double) with 348 unique values and 0% NA
        new variable 'prey_weight_source' (character) with 4 unique values and 20% NA
# join in taxonomy and a and b
old_db_preyTaxamatch_LW <- preyTaxamatch_LW |>
  dplyr::select("valid_AphiaID","valid_name","kingdom","phylum","class","order","family","genus","db_AphiaID","db_ScientificName","db_CleanScientificName","abTaxlevel","a","b") |>
  mutate(abTaxlevel = factor(abTaxlevel, Taxonomic_hierarchy)) |>
  arrange(db_ScientificName, abTaxlevel) |> # here we use db_sciname and not AID
  distinct(db_ScientificName, .keep_all = TRUE)
mutate: converted 'abTaxlevel' from character to factor (0 new NA)
distinct: removed 81 rows (13%), 523 rows remaining
# Before joining in as and bs and taxonomy, we first fix NA prey latin name using Stomach_Item where possible when latin name is Na and prey weight or length is positive (10 276)
missing_db4 |>
  filter(is.na(Prey_LatinName)) |> 
  filter(prey_length > 0 | prey_weight > 0) |> 
  summarise(fix = unique(Stomach_Item))
filter: removed 265,122 rows (77%), 78,987 rows remaining
filter: removed 68,711 rows (87%), 10,276 rows remaining
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.
ℹ The deprecated feature was likely used in the tidylog package.
  Please report the issue at <https://github.com/elbersb/tidylog/issues>.
summarise: now 64 rows and one column, ungrouped
# A tibble: 64 × 1
   fix                           
   <chr>                         
 1 Pontoporeia                   
 2 Unidentified fish             
 3 Fish eggs                     
 4 Algae                         
 5 Insect                        
 6 Clupeidae scales              
 7 Unidentified algae covered wit
 8 Unidentified mass             
 9 Unidentified remains          
10 Fucus                         
# ℹ 54 more rows
# where we lack latin name and there is a stomach item, use stomach item as latin name
missing_db5 <- missing_db4 |>
  mutate(Prey_LatinName = if_else(!is.na(Prey_LatinName) & !is.na(Stomach_Item), Prey_LatinName, Stomach_Item))
mutate: changed 10,663 values (3%) of 'Prey_LatinName' (9,933 fewer NAs)
# now join in as, bs and taxonomy
# missing_db5 |> filter(is.na(Prey_LatinName)) |> # No AID without latinnanme
#  distinct(Prey_AphiaID)
missing_db6 <- missing_db5 |> # note necessary argument na_matches
  left_join(old_db_preyTaxamatch_LW, join_by("Prey_LatinName"=="db_ScientificName"), na_matches = "never")
left_join: added 13 columns (valid_AphiaID, valid_name, kingdom, phylum, class, …)
           > rows only in missing_db5                  276
           > rows only in old_db_preyTaxamatch_LW (    357)
           > matched rows                          343,833
           >                                      =========
           > rows total                            344,109
# no a and b missing where we can fix prey weights
missing_db6 |> filter(is.na(a) & prey_weight == 0 & prey_length != 0) |>
  distinct(Prey_AphiaID, Prey_LatinName, .keep_all = TRUE)
filter: removed all rows (100%)
distinct: no rows removed
# A tibble: 0 × 58
# ℹ 58 variables: Dataset <chr>, Country <chr>, Ship <chr>, Latitude <dbl>,
#   Longitude <dbl>, Estimated_Lat_Lon <chr>, ICESrectangle <chr>, Year <dbl>,
#   Month <chr>, Day <chr>, Time <chr>, Station <chr>, HaulNo <chr>,
#   Sampling_Method <chr>, Depth <dbl>, Temperature <dbl>,
#   SampleNo(FishID) <dbl>, ICES_SampleID <dbl>, Predator_AphiaID <dbl>,
#   Predator_LatinName <chr>, Predator_Weight(mean) <dbl>,
#   Predator_Lengh(mean) <dbl>, Predator_CPUE <dbl>, Stomach_WithFood <dbl>, …
# Estimate weights based on length
missing_db7 <- missing_db6 |> 
  mutate(prey_weight = if_else(is.na(a) & is.na(b),
                               case_when(prey_weight_source == "estimated_zerocount" ~ (0.01*prey_length^3)*1,
                                         prey_weight_source == "estimated" ~ (0.01*prey_length^3)*prey_count,
                                         .default = prey_weight),
                               case_when(prey_weight_source == "estimated_zerocount" ~ (a*prey_length^b)*1,
                                         prey_weight_source == "estimated" ~ (a*prey_length^b)*prey_count,
                                         .default = prey_weight)),
         prey_weight_ind = if_else(prey_count > 0 & prey_weight > 0, prey_weight / prey_count, NA))
mutate: changed 763 values (<1%) of 'prey_weight' (0 new NAs)
        new variable 'prey_weight_ind' (double) with 13,623 unique values and 51% NA
table(missing_db7$prey_weight_source, useNA = "ifany") # 70194 zeros where we cant estimate weight (equal to initial calculations at top of chunk)

          estimated estimated_zerocount            observed                <NA> 
                578                 185              273152               70194 
# What to do with 70 000 missing weights. We can give them average weight like for the new db. 
missing_db7 |> # where there is a prey species IDd, there are only ~1000 zero weights that we can fix. Here, I assume count Na/0 is zero. 
  filter(!is.na(Prey_AphiaID) & prey_weight == 0 & prey_count >= 0)
filter: removed 343,047 rows (>99%), 1,062 rows remaining
# A tibble: 1,062 × 59
   Dataset      Country Ship  Latitude Longitude Estimated_Lat_Lon ICESrectangle
   <chr>        <chr>   <chr>    <dbl>     <dbl> <chr>             <chr>        
 1 Stomach dat… DK      DAN2      55.2      15.5 <NA>              39G5         
 2 Stomach dat… PL      BAL       55.2      16.5 <NA>              39G6         
 3 Stomach dat… PL      BAL       55.2      16.5 <NA>              39G6         
 4 Stomach dat… DK      DAN2      55.2      15.5 <NA>              39G5         
 5 Stomach dat… LV      MAZ       54.8      15.5 <NA>              38G5         
 6 Stomach dat… DK      DAN2      55.8      15.5 <NA>              40G5         
 7 Stomach dat… LV      CLV       57.8      22.5 <NA>              44H2         
 8 Stomach dat… LV      CLV       56.2      20.5 <NA>              41H0         
 9 Stomach dat… LV      CLV       56.2      20.5 <NA>              41H0         
10 Stomach dat… LV      CLV       55.8      20.5 <NA>              40H0         
# ℹ 1,052 more rows
# ℹ 52 more variables: Year <dbl>, Month <chr>, Day <chr>, Time <chr>,
#   Station <chr>, HaulNo <chr>, Sampling_Method <chr>, Depth <dbl>,
#   Temperature <dbl>, `SampleNo(FishID)` <dbl>, ICES_SampleID <dbl>,
#   Predator_AphiaID <dbl>, Predator_LatinName <chr>,
#   `Predator_Weight(mean)` <dbl>, `Predator_Lengh(mean)` <dbl>,
#   Predator_CPUE <dbl>, Stomach_WithFood <dbl>, Stomach_Regurgitated <dbl>, …
# To control for outliers affecting average of individual weight, remove outliers outside quartiles
prey_avg_ind_weight_odb <- missing_db7 |> #prey_avg_ind_weight_odb |>
  filter(!is.na(Prey_AphiaID)) |> 
  filter(!prey_weight_ind %in% boxplot.stats(prey_weight_ind)$out, .by = Prey_AphiaID) |>
  reframe(avg_weight = mean(prey_weight_ind, na.rm = TRUE), .by = Prey_AphiaID)
filter: removed 79,498 rows (23%), 264,611 rows remaining
filter: removed 18,107 rows (7%), 246,504 rows remaining
# NaNs are produced despite na.rm=TRUE and nas in Prey_AphiaID are removed
prey_avg_ind_weight_odb |> 
  filter(is.na(avg_weight))
filter: removed 114 rows (92%), 10 rows remaining
# A tibble: 10 × 2
   Prey_AphiaID avg_weight
          <dbl>      <dbl>
 1       101361        NaN
 2         1066        NaN
 3         1080        NaN
 4       120020        NaN
 5       120178        NaN
 6       126508        NaN
 7       127143        NaN
 8       127196        NaN
 9         1307        NaN
10       134958        NaN
#set the NaNs to 0
prey_avg_ind_weight_odb <- prey_avg_ind_weight_odb |> 
  mutate(avg_weight = replace_na(avg_weight, 0))
mutate: changed 10 values (8%) of 'avg_weight' (10 fewer NAs)
# Join average weight and estimate weight
missing_db8 <- missing_db7 |> 
  left_join(prey_avg_ind_weight_odb) |> 
  mutate(prey_weight = if_else(prey_weight <= 0 & prey_count > 0 & !is.na(Prey_AphiaID), prey_count * avg_weight, prey_weight))
Joining with `by = join_by(Prey_AphiaID)`
left_join: added one column (avg_weight)
> rows only in missing_db7 79,498
> rows only in prey_avg_ind_weight_odb ( 0)
> matched rows 264,611
> =========
> rows total 344,109
mutate: changed 569 values (<1%) of 'prey_weight' (0 new NAs)
missing_db8 |>  # 69 621 zeros out of which many come from NAs
  filter(prey_weight == 0)
filter: removed 274,484 rows (80%), 69,625 rows remaining
# A tibble: 69,625 × 60
   Dataset      Country Ship  Latitude Longitude Estimated_Lat_Lon ICESrectangle
   <chr>        <chr>   <chr>    <dbl>     <dbl> <chr>             <chr>        
 1 Stomach dat… LV      MAZ       54.2      14.5 <NA>              37G4         
 2 Stomach dat… LV      UNKN      54.2      15.5 <NA>              37G5         
 3 Stomach dat… LV      UNKN      54.2      15.5 <NA>              37G5         
 4 Stomach dat… LV      UNKN      54.2      15.5 <NA>              37G5         
 5 Stomach dat… LV      UNKN      54.2      15.5 <NA>              37G5         
 6 Stomach dat… LV      ZBA       54.2      15.5 <NA>              37G5         
 7 Stomach dat… LV      ZBA       54.2      15.5 <NA>              37G5         
 8 Stomach dat… LV      MAZ       54.2      19.5 <NA>              37G9         
 9 Stomach dat… LV      MAZ       54.2      19.5 <NA>              37G9         
10 Stomach dat… LV      MAZ       54.2      19.5 <NA>              37G9         
# ℹ 69,615 more rows
# ℹ 53 more variables: Year <dbl>, Month <chr>, Day <chr>, Time <chr>,
#   Station <chr>, HaulNo <chr>, Sampling_Method <chr>, Depth <dbl>,
#   Temperature <dbl>, `SampleNo(FishID)` <dbl>, ICES_SampleID <dbl>,
#   Predator_AphiaID <dbl>, Predator_LatinName <chr>,
#   `Predator_Weight(mean)` <dbl>, `Predator_Lengh(mean)` <dbl>,
#   Predator_CPUE <dbl>, Stomach_WithFood <dbl>, Stomach_Regurgitated <dbl>, …

Make prey groups ODB

# Make prey groups 
missing_db9 <- missing_db8 |> # arguments are evaluated in order so proceed from specific to general
  mutate(prey_group = case_when(valid_name == "Clupea harengus"~ "herring",
                                valid_name == "Sprattus sprattus"~ "sprat",
                                valid_name == "Saduria entomon" ~ "saduria",
                                family == "Clupeidae" ~ "unidd_clupeids", # temporary, see below
                                #is.na(valid_name) ~ "other",
                                phylum == "Chordata"~ "other_chords",
                                kingdom == "Animalia" & !phylum == "Chordata" ~ "other_inverts",
                                .default = "other"))
mutate: new variable 'prey_group' (character) with 7 unique values and 0% NA
# seems to work
missing_db9 |>
  summarise(n = n(), .by= prey_group)
summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
  prey_group          n
  <chr>           <int>
1 other           69334
2 other_inverts  120172
3 other_chords    14160
4 unidd_clupeids   1601
5 herring         16601
6 sprat           46381
7 saduria         75860
missing_db9 |> # only NA prey ids and Fucus in the other group. We set all non bio to NA in the get taxonomy chunk.
  filter(prey_group == "other") |>
  summarise(n = n(), .by = db_CleanScientificName)
filter: removed 274,775 rows (80%), 69,334 rows remaining
summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
  db_CleanScientificName     n
  <chr>                  <int>
1 <NA>                   69330
2 Fucus                      4
# The NAs
missing_db9 |> 
  filter(is.na(prey_group))
filter: removed all rows (100%)
# A tibble: 0 × 61
# ℹ 61 variables: Dataset <chr>, Country <chr>, Ship <chr>, Latitude <dbl>,
#   Longitude <dbl>, Estimated_Lat_Lon <chr>, ICESrectangle <chr>, Year <dbl>,
#   Month <chr>, Day <chr>, Time <chr>, Station <chr>, HaulNo <chr>,
#   Sampling_Method <chr>, Depth <dbl>, Temperature <dbl>,
#   SampleNo(FishID) <dbl>, ICES_SampleID <dbl>, Predator_AphiaID <dbl>,
#   Predator_LatinName <chr>, Predator_Weight(mean) <dbl>,
#   Predator_Lengh(mean) <dbl>, Predator_CPUE <dbl>, Stomach_WithFood <dbl>, …
# only fish and a few Gnathostomata in other Chordates 
missing_db9 |>  
  filter(prey_group %in% c("other_chords")) |>
  summarise(n= n(), .by = valid_name)
filter: removed 329,949 rows (96%), 14,160 rows remaining
summarise: now 50 rows and 2 columns, ungrouped
# A tibble: 50 × 2
   valid_name                 n
   <chr>                  <int>
 1 Osteichthyes            1299
 2 Gadus morhua            1215
 3 Trachurus                  5
 4 Enchelyopus cimbrius     778
 5 Pleuronectidae            69
 6 Merlangius merlangus       3
 7 Gasterosteoidei          277
 8 Gasterosteus aculeatus   172
 9 Pungitius pungitius        1
10 Callionymus               34
# ℹ 40 more rows
# only invert phyla
missing_db9 |>  
  filter(prey_group %in% c("other_inverts")) |>
  summarise(n= n(), .by = phylum)
filter: removed 223,937 rows (65%), 120,172 rows remaining
summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
  phylum            n
  <chr>         <int>
1 Arthropoda    93700
2 Annelida      24796
3 Mollusca        708
4 Priapulida      929
5 Echinodermata     7
6 Rotifera         14
7 Cnidaria         18
# Perfect apart from that in the other group there are some animal prey that is classified as animalia: "Zooplancton","Unidentified invertebrata","Unidentified worm". We choose to live with having these as "other" and not as invertebrates. 
missing_db9 |> 
  filter(prey_group == "other",
         prey_weight > 0) |>
  summarise(n = n(), .by = Prey_LatinName)
filter: removed 343,797 rows (>99%), 312 rows remaining
summarise: now 23 rows and 2 columns, ungrouped
# A tibble: 23 × 2
   Prey_LatinName                     n
   <chr>                          <int>
 1 Algae                             61
 2 Unidentified algae covered wit     2
 3 Unidentified mass                  9
 4 Unidentified remains               4
 5 Fucus                              4
 6 Stone                            109
 7 Spawn                              2
 8 Waste                             12
 9 Siphon                             4
10 Plastic                           24
# ℹ 13 more rows
# Duplicates?
dup <- missing_db9 |>
  group_by(Haul_ID) |> 
  dplyr::select(-Dataset, -ICES_SampleID) |>
  duplicated() |> 
  which() 
group_by: one grouping variable (Haul_ID)
# remove duplicates
missing_db9 <- missing_db9[-dup,]

Sum prey group weights per predator

# remove zero weight predators and prey that are to no use, then make predator identifier. Remove NA prey weights
missing_db10 <- missing_db9 |>
  rename(pred_weight = `Predator_Weight(mean)`,
         pred_length = `Predator_Lengh(mean)`) |>
  filter( pred_weight > 0 | pred_length > 0) |>
  mutate(pred_ID = paste(Haul_ID, `SampleNo(FishID)`, pred_weight, pred_length, sep = "_")) # both weight and length needed to get all fish a unique ID.
rename: renamed 2 variables (pred_weight, pred_length)
filter: removed 82 rows (<1%), 343,881 rows remaining
mutate: new variable 'pred_ID' (character) with 97,412 unique values and 0% NA
# In the old db (which is not the case for the new db), we have both pred and prey data. We can thus use mutate() and distinct() instead of summarise(). 
missing_db10_sum <- missing_db10 |> 
  mutate(prey_tot_weight = sum(prey_weight), .by = c(pred_ID, prey_group)) |>
  distinct(pred_ID, prey_group, .keep_all = TRUE)
mutate: new variable 'prey_tot_weight' (double) with 13,231 unique values and 0% NA
distinct: removed 218,468 rows (64%), 125,413 rows remaining
#  summarise(tot_weight = sum(prey_weight), .by = c(pred_ID, prey_group))

# we have preserved the predators
length(unique(missing_db10$pred_ID)) 
[1] 97412
length(unique(missing_db10_sum$pred_ID)) 
[1] 97412
# make wide format, the id_cols argument is necessary to identify the columns to keep in the wide
missing_db10_sum2 <- missing_db10_sum |> 
  pivot_wider( id_cols = c(Country,Estimated_Lat_Lon,ICESrectangle,Year,Month,Day,pred_weight,pred_length,Haul_ID,lat,lon,pred_ID,Depth), names_from = "prey_group", values_from = "prey_tot_weight", values_fill = 0)
pivot_wider: reorganized (Dataset, Ship, Latitude, Longitude, Time, …) into
(other, other_inverts, other_chords, unidd_clupeids, herring, …) [was
125413x63, now 97412x20]
# no duplicates 
missing_db10_sum2 |> summarise(count = n(), .by = pred_ID) |> filter(count > 1)
summarise: now 97,412 rows and 2 columns, ungrouped
filter: removed all rows (100%)
# A tibble: 0 × 2
# ℹ 2 variables: pred_ID <chr>, count <int>
# Split unidentified clupeids on sprat and herring  
missing_db10_sum3 <- missing_db10_sum2 |> 
  mutate(sprat = sprat + unidd_clupeids*0.5,
         herring = herring + unidd_clupeids*0.5) |>
  dplyr::select(-unidd_clupeids)
mutate: changed 919 values (1%) of 'herring' (0 new NAs)
        changed 919 values (1%) of 'sprat' (0 new NAs)
# identify if theres is na in the prey group columns
missing_db10_sum3 |> # no there isnt
  map(\(.) sum(is.na(.))) 
$Country
[1] 0

$Estimated_Lat_Lon
[1] 97412

$ICESrectangle
[1] 0

$Year
[1] 0

$Month
[1] 0

$Day
[1] 622

$pred_weight
[1] 90900

$pred_length
[1] 0

$Haul_ID
[1] 0

$lat
[1] 0

$lon
[1] 0

$pred_ID
[1] 0

$Depth
[1] 97412

$other
[1] 0

$other_inverts
[1] 0

$other_chords
[1] 0

$herring
[1] 0

$sprat
[1] 0

$saduria
[1] 0
missing_db10_sum3 |> 
  filter(is.na(other_chords)) 
filter: removed all rows (100%)
# A tibble: 0 × 19
# ℹ 19 variables: Country <chr>, Estimated_Lat_Lon <chr>, ICESrectangle <chr>,
#   Year <dbl>, Month <chr>, Day <chr>, pred_weight <dbl>, pred_length <dbl>,
#   Haul_ID <chr>, lat <dbl>, lon <dbl>, pred_ID <chr>, Depth <dbl>,
#   other <dbl>, other_inverts <dbl>, other_chords <dbl>, herring <dbl>,
#   sprat <dbl>, saduria <dbl>
# replace na with 0
missing_db10_sum4 <- missing_db10_sum3 |>
  mutate(other = replace_na(other,0),
         other_inverts = replace_na(other_inverts,0),
         other_chords = replace_na(other_chords,0))
mutate: no changes
#colnames(missing_db12_sum) #select columns without prey info .

old_data <- missing_db10_sum4 |> 
  dplyr::select(Country,Estimated_Lat_Lon,ICESrectangle,Year,Month,Day,pred_weight,pred_length,Haul_ID,lat,lon,pred_ID,other_inverts,other_chords,other,herring,sprat,saduria,Depth)

old_data |> # there are unlikely extremes. These are filtered on relative prey weight in the analysis.
  rowwise() |>
  mutate(total = sum(other, other_inverts, saduria, herring, sprat, other_chords)) |>
  filter( total > 1000) |>
  arrange(desc(total)) |>
  ungroup()
mutate: new variable 'total' (double) with 15,297 unique values and 0% NA
filter: removed 97,155 rows (>99%), 257 rows remaining
ungroup: no grouping variables remain
# A tibble: 257 × 20
   Country Estimated_Lat_Lon ICESrectangle  Year Month Day   pred_weight
   <chr>   <chr>             <chr>         <dbl> <chr> <chr>       <dbl>
 1 LV      <NA>              43H0           1980 4     19             NA
 2 LV      <NA>              44H3           1980 12    19             NA
 3 LV      <NA>              44H1           1981 6     18             NA
 4 DK      <NA>              39G7           2009 11    14           1274
 5 LV      <NA>              40G8           1978 3     23             NA
 6 LV      <NA>              44H1           1981 6     18             NA
 7 LV      <NA>              38G9           1985 3     16             NA
 8 LV      <NA>              44H1           1977 10    3              NA
 9 LV      <NA>              44H1           1982 9     9              NA
10 LV      <NA>              39G8           1970 1     26             NA
# ℹ 247 more rows
# ℹ 13 more variables: pred_length <dbl>, Haul_ID <chr>, lat <dbl>, lon <dbl>,
#   pred_ID <chr>, other_inverts <dbl>, other_chords <dbl>, other <dbl>,
#   herring <dbl>, sprat <dbl>, saduria <dbl>, Depth <dbl>, total <dbl>
old_data |> # there are unlikely extremes. These are filtered on relative prey weight in the analysis.
  rowwise() |>
  mutate(total = sum(other, other_inverts, saduria, herring, sprat, other_chords),
         empty_stom = ifelse(total == 0, 1, 0)) |>
  ungroup() |> 
  summarise(empty_n = n(), .by = empty_stom) # 35 000 empty stomachs... 63 000 not empty
mutate: new variable 'total' (double) with 15,297 unique values and 0% NA
        new variable 'empty_stom' (double) with 2 unique values and 0% NA
ungroup: no grouping variables remain
summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
  empty_stom empty_n
       <dbl>   <int>
1          1   34598
2          0   62814

5. New SE data

This data has already been prepared/cleaned to som extent in another project

# Calculate total weight of prey by predator ID and prey species (i.e., across prey sizes). First create wide data frame so that I can sum easily across prey groups (columns)
SE_preyTaxamatch_LW <- preyTaxamatch_LW |>
  dplyr::select(valid_AphiaID,valid_name,kingdom,phylum,class,order,family,genus,db_AphiaID,db_ScientificName,db_CleanScientificName,abTaxlevel,a,b) |>
  mutate(abTaxlevel = factor(abTaxlevel, Taxonomic_hierarchy)) |>
  arrange(db_ScientificName, abTaxlevel) |> # here we use db_sciname and not AID
  distinct(db_ScientificName, .keep_all = TRUE)
mutate: converted 'abTaxlevel' from character to factor (0 new NA)
distinct: removed 81 rows (13%), 523 rows remaining
# Join in as, bs and taxonomy
SE2 <- SE |> # note necessary argument na_matches
  left_join(SE_preyTaxamatch_LW, join_by("prey_latin_name"=="db_ScientificName"), na_matches = "never")
left_join: added 13 columns (valid_AphiaID, valid_name, kingdom, phylum, class, …)
           > rows only in SE                      973
           > rows only in SE_preyTaxamatch_LW (   445)
           > matched rows                      91,895
           >                                  ========
           > rows total                        92,868
# there are no prey to fix here!
SE2 |>
  filter(prey_weight_g == 0 & prey_length_cm > 0) |>
  distinct(prey_latin_name)
filter: removed all rows (100%)
distinct: no rows removed
# A tibble: 0 × 1
# ℹ 1 variable: prey_latin_name <chr>
# Make prey classes
SE3 <- SE2 |> # arguments are evaluated in order so proceed from specific to general
  mutate(prey_group = case_when(valid_name == "Clupea harengus"~ "herring",
                                valid_name == "Sprattus sprattus"~ "sprat",
                                valid_name == "Saduria entomon" ~ "saduria",
                                family == "Clupeidae" ~ "unidd_clupeids", # temporary, see below
                                phylum == "Chordata"~ "other_chords",
                                kingdom == "Animalia" & !phylum == "Chordata" ~ "other_inverts",
                                .default = "other"))
mutate: new variable 'prey_group' (character) with 7 unique values and 0% NA
SE3 |> summarise(n = n(), .by= prey_group)
summarise: now 7 rows and 2 columns, ungrouped
# A tibble: 7 × 2
  prey_group         n
  <chr>          <int>
1 sprat           1520
2 herring          572
3 other_chords    3433
4 saduria         6909
5 unidd_clupeids   448
6 other_inverts  76639
7 other           3347
SE4 <- SE3 |>
  rename(prey_weight = prey_weight_g,
         pred_weight = pred_weight_g,
         pred_length = pred_length_cm)
rename: renamed 3 variables (pred_weight, pred_length, prey_weight)
# Summarise total prey weight per predator and prey gorup
SE4_sum <- SE4 |> 
  mutate(prey_tot_weight = sum(prey_weight), .by = c(pred_id, prey_group)) |>
  distinct(pred_id, prey_group, .keep_all = TRUE)
mutate: new variable 'prey_tot_weight' (double) with 3,060 unique values and 0% NA
distinct: removed 79,751 rows (86%), 13,117 rows remaining
##  summarise(tot_weight = sum(prey_weight), .by = c(pred_ID, prey_group))

# make wide format
SE4_sum2 <- SE4_sum |> 
  pivot_wider( id_cols = c(pred_id,year,month,pred_weight,pred_length,haul_id,lon,lat,date,subdiv,ices_rect,predator_latin_name,bottom_depth), names_from = "prey_group", values_from = "prey_tot_weight", values_fill = 0)
pivot_wider: reorganized (gall_bladder, prey_weight, prey_latin_name,
predator_code, quarter, …) into (sprat, herring, other_chords, saduria,
unidd_clupeids, …) [was 13117x53, now 9747x20]
# SE4_sum2 |> distinct(pred_id) |>  summarise(n=n())

# Split unidentified clupeids on sprat and herring  
SE4_sum3 <- SE4_sum2 |> 
  mutate(sprat = sprat + unidd_clupeids*0.5,
         herring = herring + unidd_clupeids*0.5) |>
  dplyr::select(-unidd_clupeids)
mutate: changed 344 values (4%) of 'sprat' (0 new NAs)
        changed 344 values (4%) of 'herring' (0 new NAs)
SE4_sum3 |> # no na in prey groups
  map(\(.) sum(is.na(.)))
$pred_id
[1] 0

$year
[1] 0

$month
[1] 0

$pred_weight
[1] 0

$pred_length
[1] 0

$haul_id
[1] 0

$lon
[1] 0

$lat
[1] 0

$date
[1] 0

$subdiv
[1] 0

$ices_rect
[1] 289

$predator_latin_name
[1] 289

$bottom_depth
[1] 289

$sprat
[1] 0

$herring
[1] 0

$other_chords
[1] 0

$saduria
[1] 0

$other_inverts
[1] 0

$other
[1] 0

6. Merge data sets

# rename ids and add a column to separate new from old database
new_db_data <- new_data |> 
  rename(lat = ShootLat,
         lon = ShootLong) |>
  mutate(#Haul_ID_tbl = as.character(tblHaulID),
         Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"),
         pred_ID = as.character(tblPredatorInformationID),
         data_source = "new_db")
rename: renamed 2 variables (lat, lon)
mutate: new variable 'Haul_ID' (character) with 1,642 unique values and 0% NA
        new variable 'pred_ID' (character) with 35,476 unique values and 0% NA
        new variable 'data_source' (character) with one unique value and 0% NA
old_db_data <- old_data |> 
  mutate(Day = as.double(Day),
         Month = as.double(Month),
         data_source = "old_db")
mutate: converted 'Month' from character to double (0 new NA)
        converted 'Day' from character to double (0 new NA)
        new variable 'data_source' (character) with one unique value and 0% NA
names(SE4_sum3)
 [1] "pred_id"             "year"                "month"              
 [4] "pred_weight"         "pred_length"         "haul_id"            
 [7] "lon"                 "lat"                 "date"               
[10] "subdiv"              "ices_rect"           "predator_latin_name"
[13] "bottom_depth"        "sprat"               "herring"            
[16] "other_chords"        "saduria"             "other_inverts"      
[19] "other"              
new_SE_data <- SE4_sum3 |>
  filter(str_detect(pred_id, "COD")) |>
  #filter(predator_latin_name %in% "Gadus morhua") |>
  rename(pred_ID = pred_id,
         Depth = bottom_depth,
         ICESrectangle = ices_rect) |>
  separate(haul_id,
           sep = "_",
           into = c("Year", "Quarter", "HaulNo"),
           remove = TRUE) |>
  dplyr::select(!Year) |>
  rename(Year = year) |> 
  mutate(Country = "SE",
         data_source = "new_SE",
         Day = day(date),
         Month = month,
         ICESrectangle = if_else(is.na(ICESrectangle), ices.rect2(lon, lat), ICESrectangle),
         Haul_ID = paste(Country, Year, Month, Day, HaulNo, ICESrectangle, sep = "_"))
filter: removed 3,882 rows (40%), 5,865 rows remaining
rename: renamed 3 variables (pred_ID, ICESrectangle, Depth)
rename: renamed one variable (Year)
mutate: changed 289 values (5%) of 'ICESrectangle' (289 fewer NAs)
        new variable 'Country' (character) with one unique value and 0% NA
        new variable 'data_source' (character) with one unique value and 0% NA
        new variable 'Day' (integer) with 20 unique values and 0% NA
        new variable 'Month' (double) with 3 unique values and 0% NA
        new variable 'Haul_ID' (character) with 332 unique values and 0% NA
comcol_all_data_pred <- intersect( intersect(colnames(new_db_data), colnames(old_db_data)),
                                   colnames(new_SE_data))

bind_rows( new_db_data |> dplyr::select(Year, pred_ID, data_source), 
           old_db_data |> dplyr::select(Year, pred_ID, data_source),
           new_SE_data |> dplyr::select(Year, pred_ID, data_source) ) |>
  summarise(count = n(), .by = c(Year, data_source)) |>
  ggplot() +
  geom_bar(aes(Year, count, fill = data_source), alpha = 1, stat="identity", position = "stack") +
  scale_x_continuous(n.breaks = 10) + ylab("# stomachs")
summarise: now 106 rows and 3 columns, ungrouped
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# Check if any of the two datasets have NA in the common columns. 
unique(is.na(old_db_data |> dplyr::select(all_of(comcol_all_data_pred)))) # Depth is all NAs but we dont need it. Day we may need however and these stomachs need to be removed in case day is a predictor. Consider also when adding in environmental data. Pred_weight will be filled in section 7.
     other_inverts saduria other_chords sprat herring other  Year Month   Day
[1,]         FALSE   FALSE        FALSE FALSE   FALSE FALSE FALSE FALSE FALSE
[2,]         FALSE   FALSE        FALSE FALSE   FALSE FALSE FALSE FALSE FALSE
[3,]         FALSE   FALSE        FALSE FALSE   FALSE FALSE FALSE FALSE  TRUE
     pred_weight pred_length   lat   lon ICESrectangle Depth Country Haul_ID
[1,]        TRUE       FALSE FALSE FALSE         FALSE  TRUE   FALSE   FALSE
[2,]       FALSE       FALSE FALSE FALSE         FALSE  TRUE   FALSE   FALSE
[3,]       FALSE       FALSE FALSE FALSE         FALSE  TRUE   FALSE   FALSE
     pred_ID data_source
[1,]   FALSE       FALSE
[2,]   FALSE       FALSE
[3,]   FALSE       FALSE
unique(is.na(new_db_data |> dplyr::select(all_of(comcol_all_data_pred))))
     other_inverts saduria other_chords sprat herring other  Year Month   Day
[1,]         FALSE   FALSE        FALSE FALSE   FALSE FALSE FALSE FALSE FALSE
[2,]         FALSE   FALSE        FALSE FALSE   FALSE FALSE FALSE FALSE FALSE
     pred_weight pred_length   lat   lon ICESrectangle Depth Country Haul_ID
[1,]       FALSE       FALSE FALSE FALSE         FALSE FALSE   FALSE   FALSE
[2,]        TRUE       FALSE FALSE FALSE         FALSE FALSE   FALSE   FALSE
     pred_ID data_source
[1,]   FALSE       FALSE
[2,]   FALSE       FALSE
unique(is.na(new_SE_data |> dplyr::select(all_of(comcol_all_data_pred))))
     other_inverts saduria other_chords sprat herring other  Year Month   Day
[1,]         FALSE   FALSE        FALSE FALSE   FALSE FALSE FALSE FALSE FALSE
[2,]         FALSE   FALSE        FALSE FALSE   FALSE FALSE FALSE FALSE FALSE
     pred_weight pred_length   lat   lon ICESrectangle Depth Country Haul_ID
[1,]       FALSE       FALSE FALSE FALSE         FALSE  TRUE   FALSE   FALSE
[2,]       FALSE       FALSE FALSE FALSE         FALSE FALSE   FALSE   FALSE
     pred_ID data_source
[1,]   FALSE       FALSE
[2,]   FALSE       FALSE
str(old_db_data)
tibble [97,412 × 20] (S3: tbl_df/tbl/data.frame)
 $ Country          : chr [1:97412] "LV" "LV" "LV" "LV" ...
 $ Estimated_Lat_Lon: chr [1:97412] NA NA NA NA ...
 $ ICESrectangle    : chr [1:97412] "37G4" "37G5" "37G5" "37G5" ...
 $ Year             : num [1:97412] 1968 1973 1973 1973 1973 ...
 $ Month            : num [1:97412] 9 5 5 5 5 12 12 1 1 1 ...
 $ Day              : num [1:97412] 19 11 11 11 11 16 16 22 22 22 ...
 $ pred_weight      : num [1:97412] NA NA NA NA NA NA NA NA NA NA ...
 $ pred_length      : num [1:97412] 25 17 17 18 16 60 77 18 25 19 ...
 $ Haul_ID          : chr [1:97412] "LV_1968_9_19_1_37G4" "LV_1973_5_11_15_37G5" "LV_1973_5_11_15_37G5" "LV_1973_5_11_15_37G5" ...
 $ lat              : num [1:97412] 54.2 54.2 54.2 54.2 54.2 ...
 $ lon              : num [1:97412] 14.5 15.5 15.5 15.5 15.5 15.5 15.5 19.5 19.5 19.5 ...
 $ pred_ID          : chr [1:97412] "LV_1968_9_19_1_37G4_52_NA_25" "LV_1973_5_11_15_37G5_100_NA_17" "LV_1973_5_11_15_37G5_69_NA_17" "LV_1973_5_11_15_37G5_74_NA_18" ...
 $ other_inverts    : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
 $ other_chords     : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
 $ other            : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
 $ herring          : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
 $ sprat            : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
 $ saduria          : num [1:97412] 0 0 0 0 0 0 0 0 0 0 ...
 $ Depth            : num [1:97412] NA NA NA NA NA NA NA NA NA NA ...
 $ data_source      : chr [1:97412] "old_db" "old_db" "old_db" "old_db" ...
str(new_db_data)
tibble [35,476 × 50] (S3: tbl_df/tbl/data.frame)
 $ tblPredatorInformationID: num [1:35476] 55095 55096 55097 55098 55099 ...
 $ other_inverts           : num [1:35476] 0.12 0.07 0.06 0.06 0.03 0.04 0.15 0.38 0.015 0.053 ...
 $ saduria                 : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ other_chords            : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ sprat                   : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ herring                 : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ other                   : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ tblUploadID             : num [1:35476] 8247 8247 8247 8247 8247 ...
 $ tblHaulID               : num [1:35476] 4701 4702 4702 4702 4702 ...
 $ Ship                    : chr [1:35476] "90MZ" "90MZ" "90MZ" "90MZ" ...
 $ Gear                    : chr [1:35476] "LBT" "LBT" "LBT" "LBT" ...
 $ HaulNo                  : num [1:35476] 1 14 14 14 14 14 30 30 1 1 ...
 $ StationNumber           : num [1:35476] 1 1 1 1 1 1 1 1 3 3 ...
 $ Year                    : num [1:35476] 1969 1969 1969 1969 1969 ...
 $ Month                   : num [1:35476] 1 1 1 1 1 1 1 1 3 3 ...
 $ Day                     : num [1:35476] 8 19 19 19 19 19 24 24 15 15 ...
 $ Time                    : chr [1:35476] "0710" "0720" "0720" "0720" ...
 $ FishID                  : num [1:35476] 95 1 2 3 5 6 31 32 82 83 ...
 $ AphiaIDPredator         : num [1:35476] 126436 126436 126436 126436 126436 ...
 $ pred_weight             : num [1:35476] 15.6 2.26 1.7 1.04 1.92 1.8 22.5 23.7 1.35 2.69 ...
 $ Number                  : num [1:35476] 1 1 1 1 1 1 1 1 1 1 ...
 $ MeasurementIncrement    : num [1:35476] 1 1 1 1 1 1 1 1 1 1 ...
 $ pred_length             : num [1:35476] 13 8 8 6 6 6 15 15 6 7 ...
 $ Code                    : chr [1:35476] NA NA NA NA ...
 $ Age                     : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
 $ Sex                     : chr [1:35476] NA NA NA NA ...
 $ MaturityScale           : chr [1:35476] NA NA NA NA ...
 $ MaturityStage           : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
 $ PreservationMethod      : chr [1:35476] "NBF" "NBF" "NBF" "NBF" ...
 $ Regurgitated            : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ StomachFullness         : logi [1:35476] NA NA NA NA NA NA ...
 $ FullStomWgt             : logi [1:35476] NA NA NA NA NA NA ...
 $ EmptyStomWgt            : logi [1:35476] NA NA NA NA NA NA ...
 $ StomachEmpty            : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ GenSamp                 : chr [1:35476] "N" "N" "N" "N" ...
 $ lat                     : num [1:35476] 57.5 57.2 57.2 57.2 57.2 ...
 $ lon                     : num [1:35476] 21.1 21.1 21.1 21.1 21.1 ...
 $ HaulLat                 : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
 $ HaulLong                : num [1:35476] NA NA NA NA NA NA NA NA NA NA ...
 $ ICESrectangle           : chr [1:35476] "43H1" "43H1" "43H1" "43H1" ...
 $ Depth                   : num [1:35476] 70 26 26 26 26 26 78 78 48 48 ...
 $ Survey                  : chr [1:35476] NA NA NA NA ...
 $ ICESDatabase            : chr [1:35476] "N" "N" "N" "N" ...
 $ Country                 : chr [1:35476] "LV" "LV" "LV" "LV" ...
 $ Reporting_organisation  : num [1:35476] 3140 3140 3140 3140 3140 3140 3140 3140 3140 3140 ...
 $ CruiseID                : chr [1:35476] "LV314090MZ1969" "LV314090MZ1969" "LV314090MZ1969" "LV314090MZ1969" ...
 $ Regurgitated_st         : num [1:35476] 0 0 0 0 0 0 0 0 0 0 ...
 $ Haul_ID                 : chr [1:35476] "LV_1969_1_8_1_43H1" "LV_1969_1_19_14_43H1" "LV_1969_1_19_14_43H1" "LV_1969_1_19_14_43H1" ...
 $ pred_ID                 : chr [1:35476] "55095" "55096" "55097" "55098" ...
 $ data_source             : chr [1:35476] "new_db" "new_db" "new_db" "new_db" ...
# remove Time, Depth and Ship (ship makes w-o columns .x and .y)
all_data <- full_join( old_db_data |> dplyr::select(-c(Depth)), # join in predator data
                       new_db_data |> dplyr::select(-c(Depth, Ship)),
                       by = comcol_all_data_pred[!comcol_all_data_pred %in% c("Depth","Ship")])  |>
  full_join(new_SE_data, by = comcol_all_data_pred[!comcol_all_data_pred %in% c("Depth","Ship")] )
full_join: added 30 columns (tblPredatorInformationID, tblUploadID, tblHaulID, Gear, HaulNo, …)
           > rows only in dplyr::select(old_db_da..   97,412
           > rows only in dplyr::select(new_db_da..   35,476
           > matched rows                                  0
           >                                        =========
           > rows total                              132,888
full_join: added 8 columns (HaulNo.x, month, Quarter, HaulNo.y, date, …)
           > rows only in full_join(dplyr::select..  132,888
           > rows only in new_SE_data                  5,865
           > matched rows                                  0
           >                                        =========
           > rows total                              138,753

7. Estimate weight of predators using yearly lw coefficients

The size of the cod in both the new db and old db are mainly represented by lengh and the length-weight relationship in eastern baltic cod has varied over time. This will likely affect our relative prey weight estimates and we want to account for this by estimate a yearly a and b using combined data from BITS (1978-) and the stomach data (1963-).

# Model a yearly LW-relationship as a linear model of log(weight) ~ log(length). And then fit gam with a a smooth of year to smooth out the variation in a and b over time. 

# identify data to use for estimating in the stomach data 
all_data2 <- all_data |> 
  mutate(pred_weight = replace_na(pred_weight, 0),
         pred_weight_source = ifelse(pred_weight <= 0, "estimated", "observed"))
mutate: changed 91,587 values (66%) of 'pred_weight' (91,587 fewer NAs)
        new variable 'pred_weight_source' (character) with 2 unique values and 0% NA
# 66% of obs contains only length
all_data2 |> 
  summarise(percobs = n() / nrow(all_data2) * 100, .by = pred_weight_source)
summarise: now 2 rows and 2 columns, ungrouped
# A tibble: 2 × 2
  pred_weight_source percobs
  <chr>                <dbl>
1 estimated             66.1
2 observed              33.9
# load BITS data
lw_data_bits <- read_csv(paste0(home, "/data/survey/historical_lw_data.csv")) 
Rows: 414145 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): sub_div, haul_id, species
dbl (8): year, lngt_cm, ind_wgt, lon, lat, quarter, month, day

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# combine bits with stomach data on lw
lw_data_stombits <- lw_data_bits |>
  filter(species == "cod") |>
  dplyr::select(year,lngt_cm,ind_wgt) |>
  rename(Year = year,
         pred_length = lngt_cm,
         pred_weight = ind_wgt) |>
  bind_rows( all_data2 |> filter(pred_weight_source == "observed") |> dplyr::select(Year, pred_weight, pred_length))
filter: removed 112,502 rows (27%), 301,643 rows remaining
rename: renamed 3 variables (Year, pred_length, pred_weight)
filter: removed 91,779 rows (66%), 46,974 rows remaining
# fit yearly lm models
ebc_yearly_lw <- lw_data_stombits |>
  group_by(Year) |>
  group_modify( ~ tidy(lm(data = .x, log(pred_weight) ~ log(pred_length)))) |>
  pivot_wider(id_cols=Year, names_from = term, values_from = estimate ) |>
  rename(a = '(Intercept)',
         b = 'log(pred_length)') |>
  mutate(a = exp(a),
         b = b) |>
  ungroup()
group_by: one grouping variable (Year)
pivot_wider: reorganized (term, estimate, std.error, statistic, p.value) into ((Intercept), log(pred_length)) [was 122x6, now 61x3]
rename: renamed 2 variables (a, b)
mutate (grouped): changed 61 values (100%) of 'a' (0 new NAs)
ungroup: no grouping variables remain
# plot a and b over time
ebc_yearly_lw |>
  filter(a < 1) |>
  pivot_longer(cols = c(a,b), names_to = "coeff", values_to = "value") |>
  ggplot(aes(Year, value, color = coeff)) +
  geom_point() +
  facet_wrap(~coeff, scales = "free") +
  stat_smooth(method = "gam", formula = y ~ s(x, k=3)) +
  expand_limits(x=c(1960,2020)) +
  labs("using a stat_smooth gam")
filter: no rows removed
pivot_longer: reorganized (a, b) into (coeff, value) [was 61x3, now 122x3]
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# IF USING GAM to smooth out the large variation between years.
# # fit a gam for a and b to get smoothed predictions of a and b.
# agam <- gam(a ~ s(Year),
#             data = ebc_yearly_lw,
#             family = gaussian())
# 
# bgam <- gam(b ~ s(Year),
#             data = ebc_yearly_lw,
#             family = gaussian())
# #predict
# apred <- predict_gam(agam, values = data.frame( Year = unique(ebc_yearly_lw$Year)))
# bpred <- predict_gam(bgam,  values = data.frame( Year = unique(ebc_yearly_lw$Year)))

# combine to plot
# abpred <- bind_rows(apred |> rename(est=a) |> mutate(coeff= "a"), bpred |> rename(est=b) |> mutate(coeff= #"b"))
#
# abpred |>
#   ggplot() +
#   geom_line(aes(Year, est, color = coeff)) +
#   geom_point(data = ebc_yearly_lw |>  filter(a < 1) |>
#                pivot_longer(cols = c(a,b), names_to = "coeff", values_to = "value"), aes(Year, value, color= coeff)) +
#   geom_ribbon(aes(ymin = lower_ci, ymax = upper_ci, x=Year), color = NA, alpha = 0.3) +
#   facet_wrap(~coeff, scales = "free") +
#   expand_limits(x=c(1960,2020)) +
#   labs("modelled gam smooth with a and b from linear model, unsmooth pattern is due to predictions on exact year values I think")
# 
# join in a and b
# all_data3 <- all_data2 |>
#   left_join(apred, join_by(Year)) |>
#   left_join(bpred, join_by(Year)) 
# 
# If not using gam to smooth variation in a a nd b between years:
all_data3 <- all_data2 |>
  left_join(ebc_yearly_lw, join_by(Year))
left_join: added 2 columns (a, b)
           > rows only in all_data2            0
           > rows only in ebc_yearly_lw (      2)
           > matched rows                138,753
           >                            =========
           > rows total                  138,753
nrow(all_data3) - nrow(all_data2)
[1] 0
# test the effect of using yearly a and bs insetad of mean general cods from fishbase 
length_decade_ab <- expand.grid(length = 10:50, decade=seq(1960,2020, by=10)) |>
  left_join(ebc_yearly_lw |> mutate(decade = round(Year/10) * 10) |> ungroup() |> summarise(meana = mean(a), .by= c(decade))) |>
  left_join(ebc_yearly_lw |> mutate(decade = round(Year/10) * 10) |> ungroup() |> summarise(meanb = mean(b), .by= c(decade))) |>
  mutate(tvar_ab = meana*length^meanb,
         fb_ab = 0.00712575*length^3.098685) # mean cod fishbase a and b
mutate: new variable 'decade' (double) with 7 unique values and 0% NA
ungroup: no grouping variables remain
summarise: now 7 rows and 2 columns, ungrouped
Joining with `by = join_by(decade)`
left_join: added one column (meana)
           > rows only in expand.grid(length = 10..    0
           > rows only in summarise(ungroup(mutat.. (  0)
           > matched rows                            287
           >                                        =====
           > rows total                              287
mutate: new variable 'decade' (double) with 7 unique values and 0% NA
ungroup: no grouping variables remain
summarise: now 7 rows and 2 columns, ungrouped
Joining with `by = join_by(decade)`
left_join: added one column (meanb)
           > rows only in left_join(expand.grid(l..    0
           > rows only in summarise(ungroup(mutat.. (  0)
           > matched rows                            287
           >                                        =====
           > rows total                              287
mutate: new variable 'tvar_ab' (double) with 287 unique values and 0% NA
        new variable 'fb_ab' (double) with 41 unique values and 0% NA
# plot
length_decade_ab |>
  ggplot(aes(length, tvar_ab, color = factor(decade))) +
  geom_line() +
  geom_line(aes(length, fb_ab), color = "black", linetype = "dashed") +
  scale_color_viridis_d() +
  labs(title = "black dashed line is weight from fishbase average a and b", y = "weight [g]")
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# estimate weight based on year a and b and on fishbase a and b for comparison in analysis. 
all_data4 <- all_data3 |>
  mutate(pred_weight_source = ifelse(pred_weight <= 0, "estimated", "observed"),
         pred_weight = ifelse(pred_weight_source == "estimated", a*pred_length^b, pred_weight),
         pred_weight_fb = ifelse(pred_weight_source == "estimated", 0.00712575*pred_length^3.098685, pred_weight)) # mean cod fishbase a and b
mutate: changed 91,779 values (66%) of 'pred_weight' (0 new NAs)
        new variable 'pred_weight_fb' (double) with 5,700 unique values and 0% NA
# Estimated weights using time varying a and b 
all_data4 |>
  arrange(desc(pred_weight_source)) |>
  ggplot(aes(pred_length, pred_weight, color = pred_weight_source)) +
  geom_point(shape = 4)  +
  labs(title = "Estimated weights using time varying a and b", y = "weight [g]")
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

8. Calculate relative prey weights

# Calculate relative prey weights and remove values above 1
all_data5 <- all_data4 |> 
  rowwise() |> 
  mutate(rpw_sad = saduria / pred_weight,
         rpw_spr = sprat / pred_weight,
         rpw_her = herring / pred_weight,
         rpw_other = other / pred_weight,
         rpw_other_inverts = other_inverts / pred_weight,
         rpw_other_chords = other_chords / pred_weight,
         rpw_tot = sum(other, other_inverts, other_chords, saduria, sprat, herring) / pred_weight,
         rpw_tot_fb = sum(other, other_inverts, other_chords, saduria, sprat, herring) / pred_weight_fb) |>
  filter(between(rpw_tot, 0, 1 )) |> ungroup()
mutate: new variable 'rpw_sad' (double) with 26,937 unique values and 0% NA
        new variable 'rpw_spr' (double) with 18,953 unique values and 0% NA
        new variable 'rpw_her' (double) with 9,975 unique values and 0% NA
        new variable 'rpw_other' (double) with 1,162 unique values and 0% NA
        new variable 'rpw_other_inverts' (double) with 49,220 unique values and 0% NA
        new variable 'rpw_other_chords' (double) with 10,213 unique values and 0% NA
        new variable 'rpw_tot' (double) with 81,418 unique values and 0% NA
        new variable 'rpw_tot_fb' (double) with 65,312 unique values and 0% NA
filter: removed 249 rows (<1%), 138,504 rows remaining
ungroup: no grouping variables remain
all_data5 |>  # log so we can see the data better
  dplyr::select(rpw_sad, rpw_spr, rpw_her, rpw_tot, rpw_other, rpw_other_chords, rpw_other_inverts) |> 
  #dplyr::select(rpw_tot) |> 
  pivot_longer(everything()) |> 
  ggplot(aes(log(value))) +
  geom_histogram(bins = 100) +
  facet_wrap(~name, ncol = 2, scales = "free")
pivot_longer: reorganized (rpw_sad, rpw_spr, rpw_her, rpw_tot, rpw_other, …)
into (name, value) [was 138504x7, now 969528x2]
Warning: Removed 746848 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

Check the proportion of stomachs without these prey

all_data5 |> 
  pivot_longer(c("herring", "saduria", "sprat","other", "other_inverts", "other_chords")) |> 
  summarise(prop_empty = sum(value == 0)/n(), 
            prop_not_empty = sum(value != 0)/n(), .by = name)
pivot_longer: reorganized (other_inverts, other_chords, other, herring, sprat, …) into (name, value) [was 138504x68, now 831024x64]
summarise: now 6 rows and 3 columns, ungrouped
# A tibble: 6 × 3
  name          prop_empty prop_not_empty
  <chr>              <dbl>          <dbl>
1 herring            0.927        0.0725 
2 saduria            0.792        0.208  
3 sprat              0.861        0.139  
4 other              0.991        0.00858
5 other_inverts      0.569        0.431  
6 other_chords       0.922        0.0785 
all_data5 |> 
  pivot_longer(c("herring", "saduria", "sprat","other", "other_inverts", "other_chords")) |> 
  summarise(prop_empty = sum(value == 0)/n(), 
            prop_not_empty = sum(value != 0)/n(), .by = c(name, Year)) |>
  ggplot(aes(Year, prop_empty, color = name)) +
  geom_line() +
  facet_wrap(~name) +
  guides(color = "none")
pivot_longer: reorganized (other_inverts, other_chords, other, herring, sprat, …) into (name, value) [was 138504x68, now 831024x64]
summarise: now 354 rows and 4 columns, ungrouped
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

Seems like fairly high proportions of cod without these key species in stomachs, but at least ~half have other prey in their stomachs

9. Remove points on land and outside Baltic

# Inspired by code from the map-plot function (in ...R/functions) but using terra instead of sf
# Specify map ranges
ymin = 52; ymax = 60.5; xmin = 13; xmax = 24

map_data <- rnaturalearth::ne_countries(
  scale = "medium",
  returnclass = "sv", continent = "europe")

# Crop the polygon for plotting and efficiency:
# st_bbox(map_data) # find the rough coordinates
swe_coast <- suppressWarnings(suppressMessages(
  terra::crop(map_data,
          c(xmin = xmin, ymin = ymin, xmax = xmax, ymax = ymax))))

# Change projection of the polygons and make SpatVector of the points
swe_coast_proj <- project(swe_coast, "epsg:32633") # utm_zone33
dat_vect <- vect(all_data5, geom=c("lon", "lat"))

# which points intesect with the land polygons
onland <- is.related(dat_vect, swe_coast, "intersects")

# seems correct
plot_map_fc +
  geom_point(data = all_data5[onland,] |> add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633), aes(X*1000, Y*1000), color="red") +
  theme_sleek(base_size = 6)
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

# remove the point on land (7500 stomachs)
all_data6 <- all_data5[!onland,]

#remove points outside the Baltic, i.e. the western most points in:
all_data6 |>
  ggplot(aes(lon, lat)) + 
  geom_point() + 
  geom_vline(xintercept = 13) +
  coord_sf()
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.

all_data6 <- all_data6 |>
  filter( lon > 13)
filter: removed 135 rows (<1%), 130,614 rows remaining

10. Last fixes, spatiotemporal distribution and save data

# add day of the year. When Day is missing (622 rows), I assume that day is 15, ie.e mid month, to not loose data for the sake of a not very important variable.
all_data7 <- all_data6 |> 
  mutate(day_of_year = if_else(!is.na(Day), 
                               yday(as.Date(paste(Year, Month, Day, sep = "-"))),
                               yday(as.Date(paste(Year, Month, 15, sep = "-")))))
mutate: new variable 'day_of_year' (double) with 277 unique values and 0% NA
# Fix some columns and select
df <- all_data7 |> 
  dplyr::select(!month) |> #remove errornous month
  rename(ices_rect = ICESrectangle,
         year = Year,
         month = Month,
         day = Day) |> 
  add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633) |> 
  dplyr::select(pred_ID, Haul_ID, herring, saduria, sprat, other, other_inverts, other_chords, rpw_tot, rpw_tot_fb, rpw_sad, rpw_spr, rpw_her, rpw_other, rpw_other_inverts, rpw_other_chords, year, month, day, day_of_year, pred_weight, pred_weight_fb, pred_length, pred_weight_source, lat, lon, ices_rect, X, Y, data_source, Country, Depth)
rename: renamed 4 variables (ices_rect, year, month, day)
# Add sample size per coordinate for plotting
df_plot <- df |>
  group_by(year, Y, X) |> 
  mutate(sample_size = n(),
         pos_id = paste(year, X, Y),
         decade = round(year/10) * 10) |> 
  ungroup() |> 
  distinct(pos_id, .keep_all = TRUE)
group_by: 3 grouping variables (year, Y, X)
mutate (grouped): new variable 'sample_size' (integer) with 299 unique values and 0% NA
                  new variable 'pos_id' (character) with 2,734 unique values and 0% NA
                  new variable 'decade' (double) with 7 unique values and 0% NA
ungroup: no grouping variables remain
distinct: removed 127,880 rows (98%), 2,734 rows remaining
plot_map_fc +
  geom_point(data = df_plot, aes(X*1000, Y*1000, size = sample_size), alpha = 0.5) +
  facet_wrap(~ decade, ncol = 4) +
  theme(legend.position = "bottom",
        axis.text.x = element_text(angle = 90)) +
  scale_size(range = c(.01, 2), name = "# stomachs") +
  geom_sf()
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

ggsave(paste0(home, "/figures/supp/year_diet_map.pdf"), width = 15, height = 15, units = "cm")
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Warning in plot_theme(plot): The `tagger.panel.tag.text` theme element is not defined in the element
hierarchy.
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).

check for duplicates

# zero dupes
df |>
  summarise(count = n(), .by = c(pred_ID, data_source, year, Country)) |>
  filter(count > 1)
summarise: now 130,614 rows and 5 columns, ungrouped
filter: removed all rows (100%)
# A tibble: 0 × 5
# ℹ 5 variables: pred_ID <chr>, data_source <chr>, year <dbl>, Country <chr>,
#   count <int>
# Save data
# v2 as of 2025-06-26
write_csv(df, paste0(home, "/data/stomach/stomachs_v2.csv")) #